mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2024-11-24 02:59:07 +08:00
move langtests and unlvtests from tesseract-ocr repository to test repository
This commit is contained in:
parent
cbef2ebe12
commit
cdfb768010
@ -24,7 +24,7 @@ SUBDIRS += src/ccmain src/api . tessdata doc unittest
|
||||
|
||||
EXTRA_DIST = README.md LICENSE
|
||||
EXTRA_DIST += aclocal.m4 config configure.ac autogen.sh
|
||||
EXTRA_DIST += tesseract.pc.in $(TRAINING_SUBDIR) java doc langtests unlvtests
|
||||
EXTRA_DIST += tesseract.pc.in $(TRAINING_SUBDIR) java doc
|
||||
EXTRA_DIST += CMakeLists.txt tesseract.pc.cmake cmake VERSION src/vs2010 cppan.yml
|
||||
|
||||
DIST_SUBDIRS = $(SUBDIRS) $(TRAINING_SUBDIR)
|
||||
|
@ -463,7 +463,6 @@ fi
|
||||
|
||||
# Output files
|
||||
AC_CONFIG_FILES([Makefile tesseract.pc])
|
||||
AC_CONFIG_FILES([langtests/Makefile])
|
||||
AC_CONFIG_FILES([src/api/Makefile])
|
||||
AC_CONFIG_FILES([src/api/tess_version.h])
|
||||
AC_CONFIG_FILES([src/arch/Makefile])
|
||||
@ -481,7 +480,6 @@ AC_CONFIG_FILES([src/wordrec/Makefile])
|
||||
AC_CONFIG_FILES([tessdata/Makefile])
|
||||
AC_CONFIG_FILES([tessdata/configs/Makefile])
|
||||
AC_CONFIG_FILES([tessdata/tessconfigs/Makefile])
|
||||
AC_CONFIG_FILES([unlvtests/Makefile])
|
||||
AC_CONFIG_FILES([unittest/Makefile])
|
||||
AC_CONFIG_FILES([java/Makefile])
|
||||
AC_CONFIG_FILES([java/com/Makefile])
|
||||
|
2
langtests/.gitignore
vendored
2
langtests/.gitignore
vendored
@ -1,2 +0,0 @@
|
||||
#
|
||||
results/*
|
@ -1,8 +0,0 @@
|
||||
|
||||
EXTRA_DIST = README.md
|
||||
EXTRA_DIST += frk_setup.sh
|
||||
EXTRA_DIST += frk_test.sh
|
||||
EXTRA_DIST += counttestset.sh
|
||||
EXTRA_DIST += runlangtests.sh
|
||||
EXTRA_DIST += runtestset.sh
|
||||
EXTRA_DIST += reports/*
|
@ -1,54 +0,0 @@
|
||||
# Language tests.
|
||||
The scripts in this directory make it possible to test Accuracy of Tesseract for different languages.
|
||||
## Setup
|
||||
### Step 1: If not already installed, download the modified ISRI toolkit,
|
||||
make and install the tools in /usr/local/bin.
|
||||
```
|
||||
git clone https://github.com/Shreeshrii/ocr-evaluation-tools.git
|
||||
cd ~/ocr-evaluation-tools
|
||||
sudo make install
|
||||
```
|
||||
### Step 2: If not already built, build tesseract.
|
||||
Use binaries from the tesseract/src/api and tesseract/src/training directory.
|
||||
### Step 3
|
||||
Download images and corresponding ground truth text for the language to be tested.
|
||||
Each testset should have only one kind of images (eg. tif, png, jpg etc).
|
||||
The ground truth text files should have the same base filename with txt extension.
|
||||
As needed, modify the filenames and create the `pages` file for each testset.
|
||||
Instructions for testing Fraktur and Sanskrit languages are given below as an example.
|
||||
## Testing for Fraktur - frk and script/Fraktur
|
||||
### Download the images and groundtruth, modify to required format.
|
||||
```
|
||||
bash -x frk_setup.sh
|
||||
```
|
||||
### Run tests for Fraktur - frk and script/Fraktur
|
||||
```
|
||||
bash -x frk_test.sh
|
||||
```
|
||||
## Testing for Sanskrit - san and script/Devanagari
|
||||
### Download the images and groundtruth, modify to required format.
|
||||
```
|
||||
bash -x deva_setup.sh
|
||||
```
|
||||
### Run tests
|
||||
```
|
||||
bash -x deva_test.sh
|
||||
```
|
||||
|
||||
### Notes from Nick White regarding wordacc
|
||||
|
||||
If you just want to remove all lines which have 100% recognition,
|
||||
you can add a 'awk' command like this:
|
||||
|
||||
ocrevalutf8 wordacc ground.txt ocr.txt | awk '$3 != 100 {print $0}' > results.txt
|
||||
|
||||
or if you've already got a results file you want to change, you can do this:
|
||||
|
||||
awk '$3 != 100 {print $0}' results.txt > newresults.txt
|
||||
|
||||
If you only want the last sections where things are broken down by
|
||||
word, you can add a sed command, like this:
|
||||
|
||||
ocrevalutf8 wordacc ground.txt ocr.txt | sed '/^ Count Missed %Right $/,$!d' | awk '$3 != 100 {print $0}' > results.txt
|
@ -1,52 +0,0 @@
|
||||
#!/bin/bash
|
||||
# File: counttestset.sh
|
||||
# Description: Script to count the errors on a single UNLV set.
|
||||
# Author: Ray Smith
|
||||
# Created: Wed Jun 13 11:58:01 PDT 2007
|
||||
#
|
||||
# (C) Copyright 2007, Google Inc.
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
if [ $# -ne 2 ]; then
  echo "Usage:$0 pagesfile langcode"
  exit 1
fi

pages=$1
langcode=$2

# The test set directory is the pages file path minus its trailing "/pages";
# the set name is the last component of that directory.
imdir=${pages%/pages}
setname=${imdir##*/}
resdir=langtests/results/$setname
mkdir -p langtests/reports
echo "Counting on set $setname in directory $imdir to $resdir"

# Per-page result files, collected in arrays so that each path stays a single
# argument when handed to accsum/wordaccsum.
accfiles=()
wafiles=()
while read -r page dir; do
  # Skip blank lines instead of scoring a page with an empty name.
  [ -n "$page" ] || continue
  # A pages line may carry an optional sub-directory after the page name.
  if [ "$dir" ]; then
    srcdir="$imdir/$dir"
  else
    srcdir="$imdir"
  fi
  echo "$srcdir/$page"
  # Count character errors.
  ocrevalutf8 accuracy "$srcdir/$page.txt" "$resdir/$page.txt" >"$resdir/$page.acc"
  accfiles+=("$resdir/$page.acc")
  # Count word errors, using the stopwords file found in the results directory.
  ocrevalutf8 wordacc -S"$resdir/$langcode.stopwords" \
    "$srcdir/$page.txt" "$resdir/$page.txt" >"$resdir/$page.wa"
  wafiles+=("$resdir/$page.wa")
done <"$pages"

# Combine the per-page reports into one character and one word report per set.
accsum "${accfiles[@]}" >"langtests/results/$setname.characc"
wordaccsum "${wafiles[@]}" >"langtests/results/$setname.wordacc"
|
@ -1,18 +0,0 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# Rebuild the san-* test set directories under ~/lang-files from the files in
# ~/lang-deva-downloads/imagessan, then install the Sanskrit stopwords file.
mkdir -p ~/lang-files
rm -rf ~/lang-files/san-*
for testset in vedic fontsamples oldstyle shreelipi alphabetsamples; do
  # Stop if a directory cannot be entered: the copy and rename steps below
  # work on the current directory.
  cd ~/lang-files || exit 1
  mkdir -p "./san-$testset"
  cp ~/lang-deva-downloads/imagessan/"$testset"/*.* "./san-$testset/"
  cd "./san-$testset/" || exit 1
  # Give each ground truth file the same base name as its image
  # (foo-gt.txt -> foo.txt). Done with mv because the two common `rename`
  # implementations take incompatible arguments.
  for truth in *-gt.txt; do
    [ -e "$truth" ] || continue
    mv -- "$truth" "${truth%-gt.txt}.txt"
  done
  # The pages file lists every png image without its extension.
  for image in *.png; do
    [ -e "$image" ] || continue
    printf '%s\n' "${image%.png}"
  done >pages
done

mkdir -p ~/lang-stopwords
cd ~/lang-stopwords || exit 1
cp ~/lang-deva-downloads/imagessan/stopwords.txt ./san.stopwords.txt
|
@ -1,18 +0,0 @@
|
||||
#!/bin/bash
|
||||
# run langtests/runlangtests.sh with the root data dir, testname, tessdata-dir, language code and image extension
|
||||
|
||||
# The relative langtests/ and ../tessdata* paths below are resolved from the
# tesseract checkout, so stop if it cannot be entered.
cd ~/tesseract || exit 1

# Script models.
langtests/runlangtests.sh ~/lang-files 4_fast_Devanagari ../tessdata_fast/script Devanagari png
langtests/runlangtests.sh ~/lang-files 4_best_int_Devanagari ../tessdata/script Devanagari png
langtests/runlangtests.sh ~/lang-files 4_best_Devanagari ../tessdata_best/script Devanagari png
# Language models.
langtests/runlangtests.sh ~/lang-files 4_fast_san ../tessdata_fast san png
langtests/runlangtests.sh ~/lang-files 4_best_int_san ../tessdata san png
langtests/runlangtests.sh ~/lang-files 4_best_san ../tessdata_best san png

langtests/runlangtests.sh ~/lang-files 4_plus40k_san ../tesstutorial-deva san png
|
||||
|
||||
#/home/ubuntu/tesstutorial-deva/san.traineddata at n iterations
|
||||
|
||||
### It takes a while to run.
|
||||
|
@ -1,24 +0,0 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# Download the two Fraktur archives into ~/lang-downloads.
mkdir -p ~/lang-downloads
cd ~/lang-downloads || exit 1
wget -O frk-jbarth-ubhd.zip http://digi.ub.uni-heidelberg.de/diglitData/v/abbyy11r8-vs-tesseract4.zip
wget -O frk-stweil-gt.zip https://digi.bib.uni-mannheim.de/~stweil/fraktur-gt.zip

# Unpack both archives into ~/lang-files/frk and assemble the frk-ligatures
# test set from the .tif images and the .txt ground truth files.
mkdir -p ~/lang-files
cd ~/lang-files || exit 1
unzip ~/lang-downloads/frk-jbarth-ubhd.zip -d frk
unzip ~/lang-downloads/frk-stweil-gt.zip -d frk
mkdir -p ./frk-ligatures
cp ./frk/abbyy-vs-tesseract/*.tif ./frk-ligatures/
cp ./frk/gt/*.txt ./frk-ligatures/

cd ./frk-ligatures/ || exit 1
# The pages file lists every tif image without its extension.
for image in *.tif; do
  [ -e "$image" ] || continue
  printf '%s\n' "${image%.tif}"
done >pages

mkdir -p ~/lang-stopwords
cd ~/lang-stopwords || exit 1
wget -O frk.stopwords.txt https://raw.githubusercontent.com/stopwords-iso/stopwords-de/master/stopwords-de.txt

# Point at the file that was just downloaded (it lives in ~/lang-stopwords).
echo "Edit ~/lang-stopwords/frk.stopwords.txt as wordacc uses a space delimited stopwords file, not line delimited."
|
@ -1,13 +0,0 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# run langtests/runlangtests.sh with the root ISRI data dir, testname, tessdata-dir, language code:
|
||||
|
||||
# The relative langtests/ and ../tessdata* paths below are resolved from the
# tesseract checkout, so stop if it cannot be entered.
cd ~/tesseract || exit 1
# Script model.
langtests/runlangtests.sh ~/lang-files 4_fast_Fraktur ../tessdata_fast/script Fraktur tif

# Language models.
langtests/runlangtests.sh ~/lang-files 4_fast_frk ../tessdata_fast frk tif
langtests/runlangtests.sh ~/lang-files 4_best_int_frk ../tessdata frk tif
langtests/runlangtests.sh ~/lang-files 4_best_frk ../tessdata_best frk tif
|
||||
|
||||
### It takes a while to run.
|
||||
|
@ -1,8 +0,0 @@
|
||||
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken
|
||||
4_best_Devanagari san-alphabetsamples 2013 56.17% 1323 12.27% 1323 12.27 606.28s
|
||||
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken
|
||||
4_best_Devanagari san-fontsamples 388 94.82% 87 86.38% 87 86.38 570.17s
|
||||
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken
|
||||
4_best_Devanagari san-oldstyle 2796 59.93% 523 39.61% 523 39.61 447.73s
|
||||
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken
|
||||
4_best_Devanagari san-shreelipi 830 94.01% 311 81.40% 311 81.40 1137.51s
|
@ -1,2 +0,0 @@
|
||||
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken
|
||||
4_best_frk frk-ligatures 178 94.73% 100 81.31% 74 75.17 94.29s
|
@ -1,8 +0,0 @@
|
||||
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken
|
||||
4_best_int_Devanagari san-alphabetsamples 2010 56.24% 1321 12.40% 1321 12.40 556.26s
|
||||
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken
|
||||
4_best_int_Devanagari san-fontsamples 396 94.72% 89 86.07% 89 86.07 524.07s
|
||||
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken
|
||||
4_best_int_Devanagari san-oldstyle 2812 59.70% 523 39.61% 523 39.61 416.57s
|
||||
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken
|
||||
4_best_int_Devanagari san-shreelipi 829 94.01% 314 81.22% 314 81.22 1087.02s
|
@ -1,2 +0,0 @@
|
||||
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken
|
||||
4_best_int_frk frk-ligatures 244 92.78% 109 79.63% 80 73.15 367.73s
|
@ -1,8 +0,0 @@
|
||||
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken
|
||||
4_best_int_san san-alphabetsamples 2342 49.01% 1353 10.28% 1353 10.28 281.60s
|
||||
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken
|
||||
4_best_int_san san-fontsamples 474 93.68% 126 80.28% 126 80.28 281.05s
|
||||
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken
|
||||
4_best_int_san san-oldstyle 3121 55.27% 602 30.48% 602 30.48 206.20s
|
||||
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken
|
||||
4_best_int_san san-shreelipi 1163 91.60% 417 75.06% 417 75.06 606.80s
|
@ -1,8 +0,0 @@
|
||||
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken
|
||||
4_best_san san-alphabetsamples 2335 49.16% 1348 10.61% 1348 10.61 300.24s
|
||||
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken
|
||||
4_best_san san-fontsamples 473 93.69% 126 80.28% 126 80.28 267.05s
|
||||
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken
|
||||
4_best_san san-oldstyle 3121 55.27% 598 30.95% 598 30.95 205.28s
|
||||
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken
|
||||
4_best_san san-shreelipi 1168 91.56% 414 75.24% 414 75.24 610.52s
|
@ -1,8 +0,0 @@
|
||||
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken
|
||||
4_fast_Devanagari san-alphabetsamples 2017 56.09% 1317 12.67% 1317 12.67 400.38s
|
||||
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken
|
||||
4_fast_Devanagari san-fontsamples 433 94.22% 108 83.10% 108 83.10 287.48s
|
||||
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken
|
||||
4_fast_Devanagari san-oldstyle 2883 58.68% 543 37.30% 543 37.30 289.85s
|
||||
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken
|
||||
4_fast_Devanagari san-shreelipi 750 94.58% 279 83.31% 279 83.31 813.19s
|
@ -1,2 +0,0 @@
|
||||
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken
|
||||
4_fast_Fraktur frk-ligatures 265 92.16% 116 78.32% 82 72.48 91.29s
|
@ -1,2 +0,0 @@
|
||||
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken
|
||||
4_fast_frk frk-ligatures 244 92.78% 109 79.63% 80 73.15 89.98s
|
@ -1,8 +0,0 @@
|
||||
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken
|
||||
4_fast_san san-alphabetsamples 2342 49.01% 1353 10.28% 1353 10.28 276.73s
|
||||
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken
|
||||
4_fast_san san-fontsamples 474 93.68% 126 80.28% 126 80.28 278.34s
|
||||
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken
|
||||
4_fast_san san-oldstyle 3121 55.27% 602 30.48% 602 30.48 222.35s
|
||||
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken
|
||||
4_fast_san san-shreelipi 1163 91.60% 417 75.06% 417 75.06 626.40s
|
@ -1,8 +0,0 @@
|
||||
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken
|
||||
4_plus10k_san san-alphabetsamples 1725 62.44% 1112 26.26% 1112 26.26 160.48s
|
||||
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken
|
||||
4_plus10k_san san-fontsamples 349 95.34% 73 88.58% 73 88.58 138.09s
|
||||
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken
|
||||
4_plus10k_san san-oldstyle 2818 59.62% 548 36.72% 548 36.72 120.83s
|
||||
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken
|
||||
4_plus10k_san san-shreelipi 746 94.61% 279 83.31% 279 83.31 292.70s
|
@ -1,8 +0,0 @@
|
||||
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken
|
||||
4_plus20k_san san-alphabetsamples 1441 68.63% 841 44.23% 841 44.23 156.57s
|
||||
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken
|
||||
4_plus20k_san san-fontsamples 356 95.25% 75 88.26% 75 88.26 135.13s
|
||||
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken
|
||||
4_plus20k_san san-oldstyle 2862 58.99% 555 35.91% 555 35.91 118.21s
|
||||
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken
|
||||
4_plus20k_san san-shreelipi 726 94.76% 267 84.03% 267 84.03 295.68s
|
@ -1,8 +0,0 @@
|
||||
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken
|
||||
4_plus30k_san san-alphabetsamples 1656 63.95% 937 37.86% 937 37.86 615.62s
|
||||
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken
|
||||
4_plus30k_san san-fontsamples 429 94.28% 89 86.07% 89 86.07 617.42s
|
||||
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken
|
||||
4_plus30k_san san-oldstyle 2885 58.66% 561 35.22% 561 35.22 432.58s
|
||||
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken
|
||||
4_plus30k_san san-shreelipi 447 96.77% 123 92.64% 123 92.64 1081.29s
|
@ -1,8 +0,0 @@
|
||||
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken
|
||||
4_plus40k_san san-alphabetsamples 1380 69.95% 775 48.61% 775 48.61 1198.16s
|
||||
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken
|
||||
4_plus40k_san san-fontsamples 401 94.65% 79 87.64% 79 87.64 1275.08s
|
||||
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken
|
||||
4_plus40k_san san-oldstyle 2860 59.01% 534 38.34% 534 38.34 977.65s
|
||||
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken
|
||||
4_plus40k_san san-shreelipi 441 96.81% 113 93.24% 113 93.24 2301.53s
|
@ -1,105 +0,0 @@
|
||||
#!/bin/bash
|
||||
##############################################################################
|
||||
# File: runlangtests.sh
|
||||
# Description: Script to run a set of accuracy test sets for any language.
|
||||
# based on runalltests.sh by Ray Smith
|
||||
# Author: Shree Devi Kumar
|
||||
# Created: June 09, 2018
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
##############################################################################
|
||||
# Exactly five arguments are required; otherwise print the usage and stop.
[ $# -eq 5 ] || {
  echo "Usage:$0 unlv-data-dir version-id tessdata-dir langcode imgext"
  exit 1
}

# Arguments 3 to 5; arguments 1 and 2 are read further down.
tessdata=$3 lang=$4 imgext=$5
|
||||
|
||||
#######################################
# timesum computes the total cpu time.
# Arguments: $1 - file whose second whitespace-separated field on each line
#                 is a number of seconds.
# Outputs:   the sum of that field over all lines, printed with two decimals
#            ("0.00" for an empty file).
#######################################
timesum() {
  awk '
    { total += $2 }
    END { printf("%.2f\n", total) }
  ' "$1"
}
|
||||
|
||||
imdir="$1"
vid="$2"
# Directory part of the path this script was started with; the helper scripts
# are run from there. Falls back to "./" when $0 has no directory part.
bindir=${0%/*}
if [ "$bindir" = "$0" ]; then
  bindir="./"
fi
rdir=langtests/reports
resroot=langtests/results

# Select the test sets for the language and the code that names its stopwords
# file in ~/lang-stopwords (script names share the file of their language).
case "$lang" in
  frk | Fraktur)
    testsets=(frk-ligatures)
    stopwords_code=frk
    ;;
  san | Devanagari)
    testsets=(san-fontsamples san-oldstyle san-shreelipi san-alphabetsamples)
    ### testsets=(san-fontsamples)
    stopwords_code=san
    ;;
  *)
    # Without test sets there is nothing to run or summarise.
    echo "No test sets are defined for language code '$lang'" >&2
    exit 1
    ;;
esac

mkdir -p "$rdir"
for testset in "${testsets[@]}"; do
  resdir=$resroot/$testset
  mkdir -p "$resdir"
  # counttestset.sh reads the stopwords from "$resdir/<langcode>.stopwords".
  cp ~/lang-stopwords/"$stopwords_code".stopwords.txt "$resdir/$lang.stopwords"
  if [ -r "$imdir/$testset/pages" ]; then
    # Run tesseract on all the pages.
    "$bindir/runtestset.sh" "$imdir/$testset/pages" "$tessdata" "$lang" "$imgext"
    # Count the errors on all the pages.
    "$bindir/counttestset.sh" "$imdir/$testset/pages" "$lang"
    # Get the new character word and nonstop word errors and accuracy.
    # The values are cut from fixed columns of the summed reports: lines 4 and
    # 5 of each report, and the second "Total" line of the word report.
    characc=$resroot/$testset.characc
    wordacc=$resroot/$testset.wordacc
    cherrs=$(head -n 4 "$characc" | tail -n 1 | cut -c1-9 | tr -d '[:blank:]')
    chacc=$(head -n 5 "$characc" | tail -n 1 | cut -c1-9 | tr -d '[:blank:]')
    wderrs=$(head -n 4 "$wordacc" | tail -n 1 | cut -c1-9 | tr -d '[:blank:]')
    wdacc=$(head -n 5 "$wordacc" | tail -n 1 | cut -c1-9 | tr -d '[:blank:]')
    nswderrs=$(grep Total "$wordacc" | head -n 2 | tail -n 1 |
      cut -c10-17 | tr -d '[:blank:]')
    nswdacc=$(grep Total "$wordacc" | head -n 2 | tail -n 1 |
      cut -c19-26 | tr -d '[:blank:]')

    sumfile=$rdir/$vid.$testset.sum
    # The times file is written by runtestset.sh only when timings exist.
    if [ -r "$resroot/$testset.times" ]; then
      total_time=$(timesum "$resroot/$testset.times")
    else
      total_time='0.0'
    fi
    echo "RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken" >"$sumfile"
    echo "$vid $testset $cherrs $chacc $wderrs $wdacc $nswderrs $nswdacc ${total_time}s" >>"$sumfile"
  fi
done

# Join the per-set summaries into one report, move the parts out of the
# reports directory, then show the report.
cat "$rdir/$vid".*.sum >"$rdir/$vid".summary

mv "$rdir/$vid".*.sum "$resroot"/
cat "$rdir/$vid".summary
|
@ -1,61 +0,0 @@
|
||||
#!/bin/bash
|
||||
# File: runtestset.sh
|
||||
# Description: Script to run tesseract on a single UNLV set.
|
||||
# Author: Ray Smith
|
||||
# Created: Wed Jun 13 10:13:01 PDT 2007
|
||||
#
|
||||
# (C) Copyright 2007, Google Inc.
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
if [ $# -ne 4 ]; then
  echo "Usage:$0 pagesfile tessdata-dir langcode imgext"
  exit 1
fi

# Command prefix for every page. Because `time` comes from an expansion the
# external time program is run; it is asked to write the %U value to times.txt.
tess=(time -f %U -o times.txt ./src/api/tesseract)

tessdata=$2
langcode=$3
imgext=$4
pages=$1
# The test set directory is the pages file path minus its trailing "/pages";
# the set name is the last component of that directory.
imdir=${pages%/pages}
setname=${imdir##*/}

# Extra tesseract arguments (none by default).
config=()
resdir=langtests/results/$setname

echo -e "Testing on set $setname in directory $imdir to $resdir\n"
mkdir -p "$resdir"
rm -f "langtests/results/$setname.times"
while read -r page dir; do
  # Skip blank lines instead of running tesseract on an empty page name.
  [ -n "$page" ] || continue
  # A pages file may be a list of files with subdirs or maybe just
  # a plain list of files so accommodate both.
  if [ "$dir" ]; then
    srcdir="$imdir/$dir"
  else
    srcdir="$imdir"
  fi
  echo "$srcdir/$page"
  # Drop the timing of the previous page so that it is not recorded again
  # if this run leaves no times.txt behind.
  rm -f times.txt
  "${tess[@]}" "$srcdir/$page.$imgext" "$resdir/$page" \
    --tessdata-dir "$tessdata" --oem 1 -l "$langcode" --psm 6 "${config[@]}" 2>&1 |
    grep -v "OCR Engine" | grep -v "Page 1"
  if [ -r times.txt ]; then
    read -r t <times.txt
    echo "$page $t" >>"langtests/results/$setname.times"
    echo -e "\033M$page $t"
    # Stop the whole run when the first line of times.txt reports signal 2.
    if [ "$t" = "Command terminated by signal 2" ]; then
      exit 0
    fi
  fi
done <"$pages"
|
@ -1,12 +0,0 @@
|
||||
|
||||
EXTRA_DIST = README.md
|
||||
EXTRA_DIST += counttestset.sh
|
||||
EXTRA_DIST += runalltests.sh
|
||||
EXTRA_DIST += runalltests_spa.sh
|
||||
EXTRA_DIST += runtestset.sh
|
||||
EXTRA_DIST += reports/1995.bus.3B.sum
|
||||
EXTRA_DIST += reports/1995.doe3.3B.sum
|
||||
EXTRA_DIST += reports/1995.mag.3B.sum
|
||||
EXTRA_DIST += reports/1995.news.3B.sum
|
||||
EXTRA_DIST += reports/2.03.summary
|
||||
EXTRA_DIST += reports/2.04.summary
|
@ -1,94 +0,0 @@
|
||||
## How to run UNLV tests.
|
||||
|
||||
The scripts in this directory make it possible to duplicate the tests
|
||||
published in the Fourth Annual Test of OCR Accuracy.
|
||||
See http://www.expervision.com/wp-content/uploads/2012/12/1995.The_Fourth_Annual_Test_of_OCR_Accuracy.pdf
|
||||
but first you have to get the tools and data used by UNLV:
|
||||
|
||||
### Step 1: to download the images go to
|
||||
https://sourceforge.net/projects/isri-ocr-evaluation-tools-alt/files/
|
||||
and get doe3.3B.tar.gz, bus.3B.tar.gz, mag.3B.tar.gz and news.3B.tar.gz
|
||||
spn.3B.tar.gz is incorrect in this repo, so get it from code.google
|
||||
|
||||
```
|
||||
mkdir -p ~/isri-downloads
|
||||
cd ~/isri-downloads
|
||||
curl -L https://sourceforge.net/projects/isri-ocr-evaluation-tools-alt/files/bus.3B.tar.gz > bus.3B.tar.gz
|
||||
curl -L https://sourceforge.net/projects/isri-ocr-evaluation-tools-alt/files/doe3.3B.tar.gz > doe3.3B.tar.gz
|
||||
curl -L https://sourceforge.net/projects/isri-ocr-evaluation-tools-alt/files/mag.3B.tar.gz > mag.3B.tar.gz
|
||||
curl -L https://sourceforge.net/projects/isri-ocr-evaluation-tools-alt/files/news.3B.tar.gz > news.3B.tar.gz
|
||||
curl -L https://storage.googleapis.com/google-code-archive-downloads/v2/code.google.com/isri-ocr-evaluation-tools/spn.3B.tar.gz > spn.3B.tar.gz
|
||||
```
|
||||
|
||||
### Step 2: extract the files.
|
||||
It doesn't really matter where
|
||||
in your filesystem you put them, but they must go under a common
|
||||
root so you have directories doe3.3B, bus.3B, mag.3B and news.3B. in, for example,
|
||||
~/ISRI-OCRtk.
|
||||
|
||||
```
|
||||
mkdir -p ~/ISRI-OCRtk
|
||||
cd ~/ISRI-OCRtk
|
||||
tar xzvf ~/isri-downloads/bus.3B.tar.gz
|
||||
tar xzvf ~/isri-downloads/doe3.3B.tar.gz
|
||||
tar xzvf ~/isri-downloads/mag.3B.tar.gz
|
||||
tar xzvf ~/isri-downloads/news.3B.tar.gz
|
||||
tar xzvf ~/isri-downloads/spn.3B.tar.gz
|
||||
mkdir -p stopwords
|
||||
cd stopwords
|
||||
wget -O spa.stopwords.txt https://raw.githubusercontent.com/stopwords-iso/stopwords-es/master/stopwords-es.txt
|
||||
```
|
||||
Edit ~/ISRI-OCRtk/stopwords/spa.stopwords.txt
|
||||
wordacc uses a space delimited stopwords file, not line delimited.
|
||||
s/\n/ /g
|
||||
|
||||
Edit ~/ISRI-OCRtk/spn.3B/pages
|
||||
Delete the line containing the following imagename as it [crashes tesseract](https://github.com/tesseract-ocr/tesseract/issues/1647#issuecomment-395954717).
|
||||
|
||||
7733_005.3B 3
|
||||
|
||||
### Step 3: Download the modified ISRI toolkit, make and install the tools :
|
||||
These will be installed in /usr/local/bin.
|
||||
|
||||
```
|
||||
git clone https://github.com/Shreeshrii/ocr-evaluation-tools.git
|
||||
cd ~/ocr-evaluation-tools
|
||||
sudo make install
|
||||
```
|
||||
|
||||
### Step 4: cd back to your main tesseract-ocr dir and Build tesseract.
|
||||
|
||||
### Step 5: run unlvtests/runalltests.sh with the root ISRI data dir, testname, tessdata-dir:
|
||||
|
||||
```
|
||||
unlvtests/runalltests.sh ~/ISRI-OCRtk 4_fast_eng ../tessdata_fast
|
||||
```
|
||||
and go to the gym, have lunch etc. It takes a while to run.
|
||||
|
||||
### Step 6: There should be a RELEASE.summary file
|
||||
*unlvtests/reports/4-beta_fast.summary* that contains the final summarized accuracy
|
||||
report and comparison with the 1995 results.
|
||||
|
||||
### Step 7: run the test for Spanish.
|
||||
|
||||
```
|
||||
unlvtests/runalltests_spa.sh ~/ISRI-OCRtk 4_fast_spa ../tessdata_fast
|
||||
```
|
||||
|
||||
#### Notes from Nick White regarding wordacc
|
||||
|
||||
If you just want to remove all lines which have 100% recognition,
|
||||
you can add a 'awk' command like this:
|
||||
|
||||
ocrevalutf8 wordacc ground.txt ocr.txt | awk '$3 != 100 {print $0}' > results.txt
|
||||
|
||||
or if you've already got a results file you want to change, you can do this:
|
||||
|
||||
awk '$3 != 100 {print $0}' results.txt > newresults.txt
|
||||
|
||||
If you only want the last sections where things are broken down by
|
||||
word, you can add a sed command, like this:
|
||||
|
||||
ocrevalutf8 wordacc ground.txt ocr.txt | sed '/^ Count Missed %Right $/,$!d' | awk '$3 != 100 {print $0}' > results.txt
|
@ -1,68 +0,0 @@
|
||||
#!/bin/bash
|
||||
# File: counttestset.sh
|
||||
# Description: Script to count the errors on a single UNLV set.
|
||||
# Author: Ray Smith
|
||||
# Created: Wed Jun 13 11:58:01 PDT 2007
|
||||
#
|
||||
# (C) Copyright 2007, Google Inc.
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
if [ $# -ne 2 ]; then
  echo "Usage:$0 pagesfile langcode"
  exit 1
fi
# All paths below are relative to the tesseract-ocr root directory.
if [ ! -d src/api ]; then
  echo "Run $0 from the tesseract-ocr root directory!"
  exit 1
fi

pages=$1
langcode=$2

# The test set directory is the pages file path minus its trailing "/pages";
# the set name is the last component of that directory.
imdir=${pages%/pages}
setname=${imdir##*/}
resdir=unlvtests/results/$setname
mkdir -p unlvtests/reports
echo "Counting on set $setname in directory $imdir to $resdir"

#langcode should be either eng or spa
# Anything other than eng is scored with the Spanish stopwords file; copy it
# once here rather than once per page.
if [ "$langcode" != "eng" ]; then
  cp ~/ISRI-OCRtk/stopwords/spa.stopwords.txt "$resdir/spa.stopwords"
fi

# Per-page result files, collected in arrays so that each path stays a single
# argument when handed to accsum/wordaccsum.
accfiles=()
wafiles=()
while read -r page dir; do
  # Skip blank lines instead of scoring a page with an empty name.
  [ -n "$page" ] || continue
  # A pages line may carry an optional sub-directory after the page name.
  if [ "$dir" ]; then
    srcdir="$imdir/$dir"
  else
    srcdir="$imdir"
  fi
  #echo "$srcdir/$page.tif"
  # Convert groundtruth and recognized text to UTF-8 to correctly treat accented letters.
  iconv -f ISO8859-1 -t UTF-8 "$srcdir/$page.txt" >"$srcdir/$page.text"
  iconv -f ISO8859-1 -t UTF-8 "$resdir/$page.unlv" >"$resdir/$page.text"
  # Count character errors.
  ocrevalutf8 accuracy "$srcdir/$page.text" "$resdir/$page.text" >"$resdir/$page.acc"
  accfiles+=("$resdir/$page.acc")
  # Count word errors.
  if [ "$langcode" = "eng" ]; then
    ocrevalutf8 wordacc "$srcdir/$page.text" "$resdir/$page.text" >"$resdir/$page.wa"
  else
    ocrevalutf8 wordacc -S"$resdir/spa.stopwords" \
      "$srcdir/$page.text" "$resdir/$page.text" >"$resdir/$page.wa"
  fi
  wafiles+=("$resdir/$page.wa")
done <"$pages"

# Combine the per-page reports into one character and one word report per set.
accsum "${accfiles[@]}" >"unlvtests/results/$setname.characc"
wordaccsum "${wafiles[@]}" >"unlvtests/results/$setname.wordacc"
|
||||
|
@ -1,53 +0,0 @@
|
||||
#!/bin/bash
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
if [ $# -ne 1 ]; then
  echo "Usage:$0 scantype"
  echo "UNLV data comes in several scan types:"
  echo "3B=300 dpi binary"
  echo "3A=adaptive thresholded 300 dpi"
  echo "3G=300 dpi grey"
  echo "4B=400dpi binary"
  echo "2B=200dpi binary"
  echo "For now we only use 3B"
  exit 1
fi
ext=$1

#There are several test sets without meaningful names, so rename
#them with something a bit more meaningful.
#Each s is oldname/newname
for s in 3/doe3 B/bus M/mag N/news L/legal R/rep S/spn Z/zset; do
  old=${s%/*}
  #if this set was downloaded then process it.
  if [ -r "$old/PAGES" ]; then
    new=${s#*/}.$ext
    mkdir -p "$new"
    echo "Set $old -> $new"
    # Read PAGES word by word without letting the shell glob-expand the names.
    # Each fixed name is written to the new pages file and its files are
    # copied in the same pass.
    while read -r -a names || [ "${#names[@]}" -gt 0 ]; do
      for page in "${names[@]}"; do
        #The pages file had - instead of _ so fix it and add the extension.
        f="${page%-*}_${page#*-}.$ext"
        printf '%s\n' "$f"
        #Put a tif extension on the tif files.
        cp "$old/${old}_B/$f" "$new/$f.tif"
        #Put a uzn extension on the zone files.
        cp "$old/${old}_B/${f}Z" "$new/$f.uzn"
        #Cat all the truth files together and put into a single txt file.
        cat "$old/${old}_GT/${f%."$ext"}".Z* >"$new/$f.txt"
      done
    done <"$old/PAGES" >"$new/pages"
  fi
done
|
@ -1 +0,0 @@
|
||||
1995 bus.3B 5959 98.14% 0.00% 1631 96.83% 0.00% 1293 95.73% 0.00%
|
@ -1 +0,0 @@
|
||||
1995 doe3.3B 36349 97.52% 0.00% 7826 96.34% 0.00% 7042 94.87% 0.00%
|
@ -1 +0,0 @@
|
||||
1995 mag.3B 15043 97.74% 0.00% 4566 96.01% 0.00% 3379 94.99% 0.00%
|
@ -1 +0,0 @@
|
||||
1995 news.3B 6432 98.69% 0.00% 1946 97.68% 0.00% 1502 96.94% 0.00%
|
@ -1,9 +0,0 @@
|
||||
1995 bus.3B 5959 98.14% 0.00% 1631 96.83% 0.00% 1293 95.73% 0.00%
|
||||
1995 doe3.3B 36349 97.52% 0.00% 7826 96.34% 0.00% 7042 94.87% 0.00%
|
||||
1995 mag.3B 15043 97.74% 0.00% 4566 96.01% 0.00% 3379 94.99% 0.00%
|
||||
1995 news.3B 6432 98.69% 0.00% 1946 97.68% 0.00% 1502 96.94% 0.00%
|
||||
2.03 bus.3B 6422 97.99% 7.77% 1750 96.60% 7.30% 1361 95.51 5.26%
|
||||
2.03 doe3.3B 29520 97.98% -18.79% 7966 96.27% 1.79% 6764 95.07 -3.95%
|
||||
2.03 mag.3B 14568 97.81% -3.16% 4288 96.25% -6.09% 3054 95.47 -9.62%
|
||||
2.03 news.3B 7655 98.44% 19.01% 1730 97.94% -11.10% 1208 97.54 -19.57%
|
||||
2.03 Total 58165 - -8.81% 15734 - -1.47% 12387 - -6.27%
|
@ -1,9 +0,0 @@
|
||||
1995 bus.3B 5959 98.14% 0.00% 1631 96.83% 0.00% 1293 95.73% 0.00%
|
||||
1995 doe3.3B 36349 97.52% 0.00% 7826 96.34% 0.00% 7042 94.87% 0.00%
|
||||
1995 mag.3B 15043 97.74% 0.00% 4566 96.01% 0.00% 3379 94.99% 0.00%
|
||||
1995 news.3B 6432 98.69% 0.00% 1946 97.68% 0.00% 1502 96.94% 0.00%
|
||||
2.04 bus.3B 6422 97.99% 7.77% 1750 96.60% 7.30% 1361 95.51 5.26%
|
||||
2.04 doe3.3B 29514 97.98% -18.80% 7963 96.27% 1.75% 6762 95.07 -3.98%
|
||||
2.04 mag.3B 14568 97.81% -3.16% 4289 96.25% -6.07% 3053 95.47 -9.65%
|
||||
2.04 news.3B 7655 98.44% 19.01% 1730 97.94% -11.10% 1208 97.54 -19.57%
|
||||
2.04 Total 58159 - -8.82% 15732 - -1.48% 12384 - -6.30%
|
@ -1,2 +0,0 @@
|
||||
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWordErrors Accuracy TimeTaken
|
||||
4_best_int_spa spn.3B 2846 99.18% 937 98.39% 739 97.54 6478.02s
|
@ -1,2 +0,0 @@
|
||||
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWordErrors Accuracy TimeTaken
|
||||
4_best_spa spn.3B 2823 99.19% 924 98.41% 729 97.57 7233.76s
|
@ -1,9 +0,0 @@
|
||||
1995 bus.3B 5959 98.14% 0.00% 1631 96.83% 0.00% 1293 95.73% 0.00%
|
||||
1995 doe3.3B 36349 97.52% 0.00% 7826 96.34% 0.00% 7042 94.87% 0.00%
|
||||
1995 mag.3B 15043 97.74% 0.00% 4566 96.01% 0.00% 3379 94.99% 0.00%
|
||||
1995 news.3B 6432 98.69% 0.00% 1946 97.68% 0.00% 1502 96.94% 0.00%
|
||||
4_fast_eng bus.3B 6124 98.11% 2.77% 1138 97.88% -30.23% 963 97.05 -25.52% 3935.26s
|
||||
4_fast_eng doe3.3B 30029 97.96% -17.39% 13781 94.45% 76.09% 13178 92.38 87.13% 18847.36s
|
||||
4_fast_eng mag.3B 10934 98.37% -27.32% 3343 97.15% -26.78% 2813 96.06 -16.75% 6867.14s
|
||||
4_fast_eng news.3B 5734 98.84% -10.85% 1322 98.45% -32.07% 1040 97.94 -30.76% 5527.38s
|
||||
4_fast_eng Total 52821 - -17.19% 19584 - 22.64% 17994 - 36.15%
|
@ -1,2 +0,0 @@
|
||||
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWordErrors Accuracy TimeTaken
|
||||
4_fast_spa spn.3B 2841 99.18% 879 98.49% 742 97.53 3838.82s
|
@ -1,135 +0,0 @@
|
||||
#!/bin/bash
|
||||
# File: runalltests.sh
|
||||
# Description: Script to run a set of UNLV test sets for English.
|
||||
# Author: Ray Smith
|
||||
# Created: Thu Jun 14 08:21:01 PDT 2007
|
||||
#
|
||||
# (C) Copyright 2007, Google Inc.
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
# abort MESSAGE: print MESSAGE on stdout and leave the script with status 1.
abort() {
  echo "$1"
  exit 1
}

# Exactly three arguments are required.
[[ $# -eq 3 ]] || abort "Usage:$0 unlv-data-dir version-id tessdata-dir"
# All paths below are relative to the source tree root.
[[ -d src/api ]] || abort "Run $0 from the tesseract-ocr root directory!"
# Either a tesseract binary under src/api or a tesseract.exe must be readable.
[[ -r src/api/tesseract || -r tesseract.exe ]] ||
  abort "Please build tesseract before running $0"
tessdata=$3
|
||||
|
||||
# deltapc NEW OLD
# Prints the percent change from OLD to NEW with two decimals and no trailing
# newline, i.e. 100*(NEW-OLD)/OLD.
# The operands are handed to awk with -v instead of being pasted into the
# program text, so empty or negative values can no longer break the awk syntax.
# When OLD is zero the quotient is undefined: "0.00" is printed if NEW is zero
# as well, "inf" otherwise, rather than letting awk abort on a division by zero.
deltapc() {
  awk -v new="$1" -v old="$2" 'BEGIN {
    if (old + 0 == 0) {
      printf("%s", (new + 0 == 0) ? "0.00" : "inf");
      exit;
    }
    printf("%.2f", 100.0 * (new - old) / old);
  }'
}
|
||||
|
||||
# timesum FILE
# Adds up the second whitespace-separated column of FILE and prints the sum
# with two decimals followed by a newline.
timesum() {
  awk '{ cpu_total += $2 }
       END { printf("%.2f\n", cpu_total + 0.0) }' "$1"
}
|
||||
|
||||
imdir="$1"
vid="$2"
# Directory part of the path this script was invoked with; the helper scripts
# are started from there. Without any slash in $0, "./" is used.
bindir=${0%/*}
if [ "$bindir" = "$0" ]
then
  bindir="./"
fi
rdir=unlvtests/reports
resdir=unlvtests/results

testsets="bus.3B doe3.3B mag.3B news.3B"
#testsets="bus.3B"

# line_cols FILE N COLS
# Prints character columns COLS of the last of the first N lines of FILE,
# with blanks removed.
line_cols() {
  head -n "$2" "$1" | tail -n 1 | cut -c"$3" | tr -d '[:blank:]'
}

# total_cols FILE COLS
# Same extraction, applied to the second line of FILE containing "Total"
# (or the only such line if there is just one).
total_cols() {
  grep Total "$1" | head -n 2 | tail -n 1 | cut -c"$2" | tr -d '[:blank:]'
}

totalerrs=0
totalwerrs=0
totalnswerrs=0
totalolderrs=0
totaloldwerrs=0
totaloldnswerrs=0
for set in $testsets
do
  if [ -r "$imdir/$set/pages" ]
  then
    # Run tesseract on all the pages.
    "$bindir/runtestset.sh" "$imdir/$set/pages" "$tessdata" "eng"
    # Count the errors on all the pages.
    "$bindir/counttestset.sh" "$imdir/$set/pages" "eng"
    # Get the old character word and nonstop word errors.
    # The baseline .sum files are tab-delimited (cut -f default delimiter).
    olderrs=$(cut -f3 "$rdir/1995.$set.sum")
    oldwerrs=$(cut -f6 "$rdir/1995.$set.sum")
    oldnswerrs=$(cut -f9 "$rdir/1995.$set.sum")
    # Get the new character word and nonstop word errors and accuracy.
    cherrs=$(line_cols "$resdir/$set.characc" 4 1-9)
    chacc=$(line_cols "$resdir/$set.characc" 5 1-9)
    wderrs=$(line_cols "$resdir/$set.wordacc" 4 1-9)
    wdacc=$(line_cols "$resdir/$set.wordacc" 5 1-9)
    nswderrs=$(total_cols "$resdir/$set.wordacc" 10-17)
    nswdacc=$(total_cols "$resdir/$set.wordacc" 19-26)
    # Compute the percent change.
    chdelta=$(deltapc "$cherrs" "$olderrs")
    wdelta=$(deltapc "$wderrs" "$oldwerrs")
    nswdelta=$(deltapc "$nswderrs" "$oldnswerrs")
    sumfile=$rdir/$vid.$set.sum
    if [ -r "$resdir/$set.times" ]
    then
      total_time=$(timesum "$resdir/$set.times")
      if [ -r "$resdir/prev/$set.times" ]
      then
        # Per-page time difference against the previous run, sorted by delta.
        paste "$resdir/prev/$set.times" "$resdir/$set.times" |
          awk '{ printf("%s %.2f\n", $1, $4-$2); }' |
          sort -k2n >"$resdir/$set.timedelta"
      fi
    else
      total_time='0.0'
    fi
    # One tab-separated record, laid out like the baseline files read above
    # (error counts in fields 3, 6 and 9). Spelling the separators out avoids
    # the line-continuation form, which glued two fields together.
    printf '%s\t%s\t%s\t%s\t%s%%\t%s\t%s\t%s%%\t%s\t%s\t%s%%\t%ss\n' \
      "$vid" "$set" "$cherrs" "$chacc" "$chdelta" "$wderrs" "$wdacc" \
      "$wdelta" "$nswderrs" "$nswdacc" "$nswdelta" "$total_time" >"$sumfile"
    # Sum totals over all the testsets.
    totalerrs=$((totalerrs + cherrs))
    totalwerrs=$((totalwerrs + wderrs))
    totalnswerrs=$((totalnswerrs + nswderrs))
    totalolderrs=$((totalolderrs + olderrs))
    totaloldwerrs=$((totaloldwerrs + oldwerrs))
    totaloldnswerrs=$((totaloldnswerrs + oldnswerrs))
  fi
done
# Compute grand total percent change.
chdelta=$(deltapc "$totalerrs" "$totalolderrs")
wdelta=$(deltapc "$totalwerrs" "$totaloldwerrs")
nswdelta=$(deltapc "$totalnswerrs" "$totaloldnswerrs")
tfile=$rdir/$vid.total.sum
printf '%s\tTotal\t%s\t-\t%s%%\t%s\t-\t%s%%\t%s\t-\t%s%%\n' \
  "$vid" "$totalerrs" "$chdelta" "$totalwerrs" "$wdelta" \
  "$totalnswerrs" "$nswdelta" >"$tfile"
# Baseline rows first, then this version's rows, in one summary file.
cat "$rdir"/1995.*.sum "$rdir/$vid".*.sum >"$rdir/$vid".summary

# Only the summary stays in the reports directory.
mv "$rdir/$vid".*.sum "$resdir/"
cat "$rdir/$vid".summary
|
@ -1,109 +0,0 @@
|
||||
#!/bin/bash
|
||||
##############################################################################
|
||||
# File: runalltests_spa.sh
|
||||
# Description: Script to run a set of UNLV test sets for Spanish.
|
||||
# based on runalltests.sh by Ray Smith
|
||||
# Author: Shree Devi Kumar
|
||||
# Created: June 09, 2018
|
||||
#
|
||||
# (C) Copyright 2007, Google Inc.
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
##############################################################################
|
||||
# Argument and environment checks: exactly three arguments, started from the
# source tree root, with a readable tesseract binary.
if [ $# -ne 3 ]; then
  echo "Usage:$0 unlv-data-dir version-id tessdata-dir"
  exit 1
fi
if [ ! -d src/api ]; then
  echo "Run $0 from the tesseract-ocr root directory!"
  exit 1
fi
if [ ! -r src/api/tesseract ] && [ ! -r tesseract.exe ]; then
  echo "Please build tesseract before running $0"
  exit 1
fi
tessdata=$3
# No lang variable is set here: a fourth argument can never be present after
# the check above, and the language is passed literally as "spa" further down.
|
||||
|
||||
# timesum FILE
# Prints the sum of the second whitespace-separated column of FILE, formatted
# with two decimals and terminated by a newline.
timesum() {
  awk 'BEGIN { acc = 0.0 }
       { acc = acc + $2 }
       END { printf("%.2f\n", acc) }' "$1"
}
|
||||
|
||||
imdir="$1"
vid="$2"
# Directory part of the path this script was invoked with; the helper scripts
# are started from there. Without any slash in $0, "./" is used.
bindir=${0%/*}
if [ "$bindir" = "$0" ]
then
  bindir="./"
fi
rdir=unlvtests/reports
resdir=unlvtests/results

testsets="spn.3B"

# line_cols FILE N COLS
# Prints character columns COLS of the last of the first N lines of FILE,
# with blanks removed.
line_cols() {
  head -n "$2" "$1" | tail -n 1 | cut -c"$3" | tr -d '[:blank:]'
}

# total_cols FILE COLS
# Same extraction, applied to the second line of FILE containing "Total"
# (or the only such line if there is just one).
total_cols() {
  grep Total "$1" | head -n 2 | tail -n 1 | cut -c"$2" | tr -d '[:blank:]'
}

for set in $testsets
do
  if [ -r "$imdir/$set/pages" ]
  then
    # Run tesseract on all the pages.
    "$bindir/runtestset.sh" "$imdir/$set/pages" "$tessdata" "spa"
    # Count the errors on all the pages.
    "$bindir/counttestset.sh" "$imdir/$set/pages" "spa"
    # Get the new character word and nonstop word errors and accuracy.
    cherrs=$(line_cols "$resdir/$set.characc" 4 1-9)
    chacc=$(line_cols "$resdir/$set.characc" 5 1-9)
    wderrs=$(line_cols "$resdir/$set.wordacc" 4 1-9)
    wdacc=$(line_cols "$resdir/$set.wordacc" 5 1-9)
    nswderrs=$(total_cols "$resdir/$set.wordacc" 10-17)
    nswdacc=$(total_cols "$resdir/$set.wordacc" 19-26)

    sumfile=$rdir/$vid.$set.sum
    if [ -r "$resdir/$set.times" ]
    then
      total_time=$(timesum "$resdir/$set.times")
      if [ -r "$resdir/prev/$set.times" ]
      then
        # Per-page time difference against the previous run, sorted by delta.
        paste "$resdir/prev/$set.times" "$resdir/$set.times" |
          awk '{ printf("%s %.2f\n", $1, $4-$2); }' |
          sort -k2n >"$resdir/$set.timedelta"
      fi
    else
      total_time='0.0'
    fi
    # Header row followed by one data row. The separators are written out
    # explicitly because the line-continuation form glued two fields together.
    # NOTE(review): a tab is assumed as separator - confirm against the
    # committed *_spa.summary reports.
    printf '%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' \
      "RELEASE" "TestSet" "CharErrors" "Accuracy" "WordErrors" "Accuracy" \
      "NonStopWordErrors" "Accuracy" "TimeTaken" >"$sumfile"
    printf '%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%ss\n' \
      "$vid" "$set" "$cherrs" "$chacc" "$wderrs" "$wdacc" \
      "$nswderrs" "$nswdacc" "$total_time" >>"$sumfile"
  fi
done

cat "$rdir/$vid".*.sum >"$rdir/$vid".summary

# Only the summary stays in the reports directory.
mv "$rdir/$vid".*.sum "$resdir/"
cat "$rdir/$vid".summary
|
@ -1,80 +0,0 @@
|
||||
#!/bin/bash
|
||||
# File: runtestset.sh
|
||||
# Description: Script to run tesseract on a single UNLV set.
|
||||
# Author: Ray Smith
|
||||
# Created: Wed Jun 13 10:13:01 PDT 2007
|
||||
#
|
||||
# (C) Copyright 2007, Google Inc.
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
# Runs tesseract on every page listed in a UNLV pages file and records the
# per-page time reported by the time command.
# Usage: runtestset.sh pagesfile tessdata-dir lang [-zoning]
if [ $# -ne 3 ] && [ $# -ne 4 ]
then
  echo "Usage:$0 pagesfile tessdata-dir lang [-zoning]"
  exit 1
fi
if [ ! -d src/api ]
then
  echo "Run $0 from the tesseract-ocr root directory!"
  exit 1
fi
# The command line is kept in an array so it is never re-split on whitespace.
if [ -r src/api/tesseract ]
then
  # "time" reaches the shell through an expansion, so it is looked up as an
  # ordinary command rather than the shell keyword. The -f/-o options make it
  # write the user CPU time (%U) to times.txt.
  # NOTE(review): this needs an external time program that supports -f and -o
  # (GNU time) - confirm it is installed where the tests run.
  tess=(time -f %U -o times.txt src/api/tesseract)
  #tess=(time -f %U -o times.txt tesseract)
elif [ -r tesseract.exe ]
then
  tess=(./tesseract.exe)
else
  echo "Please build tesseract before running $0"
  exit 1
fi

tessdata=$2
lang=$3
pages=$1
# The image directory is the pages file path without its "/pages" suffix and
# the set name is the last component of that directory.
imdir=${pages%/pages}
setname=${imdir##*/}
if [ $# -eq 4 ] && [ "$4" = "-zoning" ]
then
  config=unlv.auto
  resdir=unlvtests/results/zoning.$setname
else
  config=unlv
  resdir=unlvtests/results/$setname
fi
printf 'Testing on set %s in directory %s to %s\n\n' "$setname" "$imdir" "$resdir"
mkdir -p "$resdir"
rm -f "unlvtests/results/$setname.times"
while read -r page dir
do
  # A pages file may be a list of files with subdirs or maybe just
  # a plain list of files so accommodate both.
  if [ "$dir" ]
  then
    srcdir="$imdir/$dir"
  else
    srcdir="$imdir"
  fi
  # echo "$srcdir/$page.tif"
  # Drop any timing left over from an earlier page or run, so a stale value
  # is never recorded when this invocation writes no times.txt.
  rm -f times.txt
  "${tess[@]}" "$srcdir/$page.tif" "$resdir/$page" --tessdata-dir "$tessdata" \
    --oem 1 -l "$lang" --psm 6 "$config" 2>&1 |
    grep -v "OCR Engine" | grep -v "Page 1"
  if [ -r times.txt ]
  then
    read -r t <times.txt
    echo "$page $t" >>"unlvtests/results/$setname.times"
    # ESC M precedes the progress text, as in the original echo -e.
    printf '\033M%s %s\n' "$page" "$t"
    # The time program writes this text when the run was interrupted;
    # stop processing the remaining pages in that case.
    if [ "$t" = "Command terminated by signal 2" ]
    then
      exit 0
    fi
  fi
done <"$pages"
|
Loading…
Reference in New Issue
Block a user