From ea7f4801edba328352cafa812d541f379439d864 Mon Sep 17 00:00:00 2001 From: Shree Devi Kumar Date: Fri, 8 Jun 2018 14:28:50 +0000 Subject: [PATCH 1/3] add option for UNLV tests for spa --- unlvtests/README.md | 26 +++++++++++----- unlvtests/counttestset.sh | 16 +++++----- unlvtests/reports/1995.spn.3B.sum | 1 + unlvtests/runalltests.sh | 50 ++++++++++++++++++++----------- unlvtests/runtestset.sh | 16 +++++----- 5 files changed, 69 insertions(+), 40 deletions(-) create mode 100644 unlvtests/reports/1995.spn.3B.sum diff --git a/unlvtests/README.md b/unlvtests/README.md index 89c699db..ae3a2385 100644 --- a/unlvtests/README.md +++ b/unlvtests/README.md @@ -5,9 +5,10 @@ published in the Fourth Annual Test of OCR Accuracy. See http://www.isri.unlv.edu/downloads/AT-1995.pdf but first you have to get the tools and data used by UNLV: -Step 1: to download the images goto +Step 1: to download the images go to https://sourceforge.net/projects/isri-ocr-evaluation-tools-alt/files/ and get doe3.3B.tar.gz, bus.3B.tar.gz, mag.3B.tar.gz and news.3B.tar.gz +spn.3B.tar.gz is incorrect in this repo, so get it from code.google mkdir -p ~/isri-downloads cd ~/isri-downloads @@ -15,6 +16,7 @@ curl -L https://sourceforge.net/projects/isri-ocr-evaluation-tools-alt/files/bu curl -L https://sourceforge.net/projects/isri-ocr-evaluation-tools-alt/files/doe3.3B.tar.gz > doe3.3B.tar.gz curl -L https://sourceforge.net/projects/isri-ocr-evaluation-tools-alt/files/mag.3B.tar.gz > mag.3B.tar.gz curl -L https://sourceforge.net/projects/isri-ocr-evaluation-tools-alt/files/news.3B.tar.gz > news.3B.tar.gz +curl -L https://storage.googleapis.com/google-code-archive-downloads/v2/code.google.com/isri-ocr-evaluation-tools/spn.3B.tar.gz > spn.3B.tar.gz Step 2: extract the files. 
It doesn't really matter where in your filesystem you put them, but they must go under a common @@ -27,19 +29,27 @@ tar xzvf ~/isri-downloads/bus.3B.tar.gz tar xzvf ~/isri-downloads/doe3.3B.tar.gz tar xzvf ~/isri-downloads/mag.3B.tar.gz tar xzvf ~/isri-downloads/news.3B.tar.gz +tar xzvf ~/isri-downloads/spn.3B.tar.gz -Step 4: Download the modified ISRI toolkit from: -https://ancientgreekocr.org/ocr-evaluation-tools.git +**** Edit ~/ISRI-OCRtk/spn.3B/pages +delete the line containing the following imagename as it crashes tesseract. +7733_005.3B.tif -make and install the tools in unlvtests/ocreval/bin by -`make PREFIX=~/tesseract/unlvtests/ocreval install` +Step 4: Download the modified ISRI toolkit and make and install the tools : + +git clone https://github.com/Shreeshrii/ocr-evaluation-tools.git +cd ~/ocr-evaluation-tools +sudo make install Step 6: cd back to your main tesseract-ocr dir and Build tesseract. -Step 7: run unlvtests/runalltests.sh with the root ISRI data dir and testname: -unlvtests/runalltests.sh ~/ISRI-OCRtk tess4.0.0-beta.1 +Step 7: run unlvtests/runalltests.sh with the root ISRI data dir and testname, tessdata-dir and language: + +unlvtests/runalltests.sh ~/ISRI-OCRtk 4_fast_eng ../tessdata_fast eng and go to the gym, have lunch etc. Step 8: There should be a file -unlvtests/reports/tess4.0.0-beta.1.summary that contains the final summarized accuracy +unlvtests/reports/4-beta_fast.summary that contains the final summarized accuracy report and comparison with the 1995 results. + +unlvtests/runalltests.sh ~/ISRI-OCRtk 4_fast_spa ../tessdata_fast spa diff --git a/unlvtests/counttestset.sh b/unlvtests/counttestset.sh index 3c86860d..560c73f7 100755 --- a/unlvtests/counttestset.sh +++ b/unlvtests/counttestset.sh @@ -43,17 +43,17 @@ do else srcdir="$imdir" fi -echo "$srcdir/$page.tif" +#echo "$srcdir/$page.tif" # Count character errors. 
- unlvtests/ocreval/bin/ocrevalutf8 unlvtests/ocreval/bin/accuracy "$srcdir/$page.txt" "$resdir/$page.unlv" "$resdir/$page.acc" - accfiles="$accfiles $resdir/$page.acc" + ocrevalutf8 accuracy "$srcdir/$page.txt" "$resdir/$page.unlv" > "$resdir/$page.acc" + accfiles="$accfiles $resdir/$page.acc" # Count word errors. - unlvtests/ocreval/bin/ocrevalutf8 unlvtests/ocreval/bin/wordacc "$srcdir/$page.txt" "$resdir/$page.unlv" "$resdir/$page.wa" + ocrevalutf8 wordacc "$srcdir/$page.txt" "$resdir/$page.unlv" > "$resdir/$page.wa" wafiles="$wafiles $resdir/$page.wa" done <"$pages" -echo "$accfiles" -echo "$wafiles" +#echo "$accfiles" +#echo "$wafiles" - unlvtests/ocreval/bin/accsum $accfiles >"unlvtests/reports/$setname.characc" - unlvtests/ocreval/bin/ocrevalutf8 unlvtests/ocreval/bin/wordaccsum $wafiles >"unlvtests/reports/$setname.wordacc" +accsum $accfiles >"unlvtests/results/$setname.characc" +wordaccsum $wafiles >"unlvtests/results/$setname.wordacc" diff --git a/unlvtests/reports/1995.spn.3B.sum b/unlvtests/reports/1995.spn.3B.sum new file mode 100644 index 00000000..35060967 --- /dev/null +++ b/unlvtests/reports/1995.spn.3B.sum @@ -0,0 +1 @@ +1995 spn.3B 100 95.00% 0.00% 100 95.00% 0.00% 100 95.00% 0.00% WAS NOT TESTED diff --git a/unlvtests/runalltests.sh b/unlvtests/runalltests.sh index 8511964a..18ef3929 100755 --- a/unlvtests/runalltests.sh +++ b/unlvtests/runalltests.sh @@ -15,9 +15,9 @@ # See the License for the specific language governing permissions and # limitations under the License. -if [ $# -ne 2 ] +if [ $# -ne 4 ] then - echo "Usage:$0 unlv-data-dir version-id" + echo "Usage:$0 unlv-data-dir version-id tessdata-dir lang " exit 1 fi if [ ! 
-d src/api ] @@ -30,7 +30,8 @@ then echo "Please build tesseract before running $0" exit 1 fi - +tessdata=$3 +lang=$4 #deltapc new old calculates the %change from old to new deltapc() { @@ -60,8 +61,20 @@ then bindir="./" fi rdir=unlvtests/reports -#testsets="bus.3B doe3.3B mag.3B news.3B" -testsets="bus.3B" + +if [ "$lang" = "eng" ] +then + testsets="bus.3B doe3.3B mag.3B news.3B" + #testsets="bus.3B" +else + if [ "$lang" = "spa" ] + then + testsets="spn.3B" + else + echo "Language has to be eng or spa" + exit 1 + fi +fi totalerrs=0 totalwerrs=0 @@ -74,7 +87,7 @@ do if [ -r "$imdir/$set/pages" ] then # Run tesseract on all the pages. - $bindir/runtestset.sh "$imdir/$set/pages" + $bindir/runtestset.sh "$imdir/$set/pages" "$tessdata" "$lang" # Count the errors on all the pages. $bindir/counttestset.sh "$imdir/$set/pages" # Get the old character word and nonstop word errors. @@ -82,30 +95,30 @@ do oldwerrs=$(cut -f6 "unlvtests/reports/1995.$set.sum") oldnswerrs=$(cut -f9 "unlvtests/reports/1995.$set.sum") # Get the new character word and nonstop word errors and accuracy. 
- cherrs=$(head -4 "unlvtests/reports/$set.characc" |tail -1 |cut -c1-9 | + cherrs=$(head -4 "unlvtests/results/$set.characc" |tail -1 |cut -c1-9 | tr -d '[:blank:]') - chacc=$(head -5 "unlvtests/reports/$set.characc" |tail -1 |cut -c1-9 | + chacc=$(head -5 "unlvtests/results/$set.characc" |tail -1 |cut -c1-9 | tr -d '[:blank:]') - wderrs=$(head -4 "unlvtests/reports/$set.wordacc" |tail -1 |cut -c1-9 | + wderrs=$(head -4 "unlvtests/results/$set.wordacc" |tail -1 |cut -c1-9 | tr -d '[:blank:]') - wdacc=$(head -5 "unlvtests/reports/$set.wordacc" |tail -1 |cut -c1-9 | + wdacc=$(head -5 "unlvtests/results/$set.wordacc" |tail -1 |cut -c1-9 | tr -d '[:blank:]') - nswderrs=$(grep Total "unlvtests/reports/$set.wordacc" |head -2 |tail -1 | + nswderrs=$(grep Total "unlvtests/results/$set.wordacc" |head -2 |tail -1 | cut -c10-17 |tr -d '[:blank:]') - nswdacc=$(grep Total "unlvtests/reports/$set.wordacc" |head -2 |tail -1 | + nswdacc=$(grep Total "unlvtests/results/$set.wordacc" |head -2 |tail -1 | cut -c19-26 |tr -d '[:blank:]') # Compute the percent change. 
chdelta=$(deltapc "$cherrs" "$olderrs") wdelta=$(deltapc "$wderrs" "$oldwerrs") nswdelta=$(deltapc "$nswderrs" "$oldnswerrs") sumfile=$rdir/$vid.$set.sum - if [ -r "unlvtests/reports/$set.times" ] + if [ -r "unlvtests/results/$set.times" ] then - total_time=$(timesum "unlvtests/reports/$set.times") - if [ -r "unlvtests/reports/prev/$set.times" ] + total_time=$(timesum "unlvtests/results/$set.times") + if [ -r "unlvtests/results/prev/$set.times" ] then - paste "unlvtests/reports/prev/$set.times" "unlvtests/reports/$set.times" | - awk '{ printf("%s %.2f\n", $1, $4-$2); }' |sort -k2n >"unlvtests/reports/$set.timedelta" + paste "unlvtests/results/prev/$set.times" "unlvtests/results/$set.times" | + awk '{ printf("%s %.2f\n", $1, $4-$2); }' |sort -k2n >"unlvtests/results/$set.timedelta" fi else total_time='0.0' @@ -129,3 +142,6 @@ tfile=$rdir/$vid.total.sum echo "$vid Total $totalerrs - $chdelta% $totalwerrs\ - $wdelta% $totalnswerrs - $nswdelta%" >"$tfile" cat $rdir/1995.*.sum "$rdir/$vid".*.sum >"$rdir/$vid".summary + +mv "$rdir/$vid".*.sum unlvtests/results/ +cat "$rdir/$vid".summary diff --git a/unlvtests/runtestset.sh b/unlvtests/runtestset.sh index 608199da..783f0bfb 100755 --- a/unlvtests/runtestset.sh +++ b/unlvtests/runtestset.sh @@ -15,9 +15,9 @@ # See the License for the specific language governing permissions and # limitations under the License. -if [ $# -ne 1 ] && [ $# -ne 2 ] +if [ $# -ne 3 ] && [ $# -ne 4 ] then - echo "Usage:$0 pagesfile [-zoning]" + echo "Usage:$0 pagesfile tessdata-dir lang [-zoning]" exit 1 fi if [ ! 
-d src/api ] @@ -36,13 +36,15 @@ then fi else tess="time -f %U -o times.txt src/api/tesseract" - export TESSDATA_PREFIX=$PWD/ + #tess="time -f %U -o times.txt tesseract" fi +tessdata=$2 +lang=$3 pages=$1 imdir=${pages%/pages} setname=${imdir##*/} -if [ $# -eq 2 ] && [ "$2" = "-zoning" ] +if [ $# -eq 4 ] && [ "$4" = "-zoning" ] then config=unlv.auto resdir=unlvtests/results/zoning.$setname @@ -52,7 +54,7 @@ else fi echo -e "Testing on set $setname in directory $imdir to $resdir\n" mkdir -p "$resdir" -rm -f "unlvtests/reports/$setname.times" +rm -f "unlvtests/results/$setname.times" while read page dir do # A pages file may be a list of files with subdirs or maybe just @@ -64,11 +66,11 @@ do srcdir="$imdir" fi # echo "$srcdir/$page.tif" - $tess "$srcdir/$page.tif" "$resdir/$page" --tessdata-dir ../tessdata_fast --oem 1 -l eng --psm 6 $config 2>&1 |grep -v "OCR Engine" |grep -v "Page 1" + $tess "$srcdir/$page.tif" "$resdir/$page" --tessdata-dir $tessdata --oem 1 -l $lang --psm 6 $config 2>&1 |grep -v "OCR Engine" |grep -v "Page 1" if [ -r times.txt ] then read t >"unlvtests/reports/$setname.times" + echo "$page $t" >>"unlvtests/results/$setname.times" echo -e "\033M$page $t" if [ "$t" = "Command terminated by signal 2" ] then From 7acebd0f52f1d1e2ad3f49920d03e5b3cddf6ac2 Mon Sep 17 00:00:00 2001 From: Shreeshrii Date: Fri, 8 Jun 2018 21:38:16 +0530 Subject: [PATCH 2/3] reformat with markdown --- unlvtests/README.md | 35 +++++++++++++++++++++++++---------- 1 file changed, 25 insertions(+), 10 deletions(-) diff --git a/unlvtests/README.md b/unlvtests/README.md index ae3a2385..c8874703 100644 --- a/unlvtests/README.md +++ b/unlvtests/README.md @@ -1,15 +1,16 @@ -How to run UNLV tests. +## How to run UNLV tests. The scripts in this directory make it possible to duplicate the tests published in the Fourth Annual Test of OCR Accuracy. 
See http://www.isri.unlv.edu/downloads/AT-1995.pdf but first you have to get the tools and data used by UNLV: -Step 1: to download the images go to +### Step 1: to download the images go to https://sourceforge.net/projects/isri-ocr-evaluation-tools-alt/files/ and get doe3.3B.tar.gz, bus.3B.tar.gz, mag.3B.tar.gz and news.3B.tar.gz spn.3B.tar.gz is incorrect in this repo, so get it from code.google +``` mkdir -p ~/isri-downloads cd ~/isri-downloads curl -L https://sourceforge.net/projects/isri-ocr-evaluation-tools-alt/files/bus.3B.tar.gz > bus.3B.tar.gz @@ -17,12 +18,15 @@ curl -L https://sourceforge.net/projects/isri-ocr-evaluation-tools-alt/files/do curl -L https://sourceforge.net/projects/isri-ocr-evaluation-tools-alt/files/mag.3B.tar.gz > mag.3B.tar.gz curl -L https://sourceforge.net/projects/isri-ocr-evaluation-tools-alt/files/news.3B.tar.gz > news.3B.tar.gz curl -L https://storage.googleapis.com/google-code-archive-downloads/v2/code.google.com/isri-ocr-evaluation-tools/spn.3B.tar.gz > spn.3B.tar.gz +``` -Step 2: extract the files. It doesn't really matter where +### Step 2: extract the files. +It doesn't really matter where in your filesystem you put them, but they must go under a common root so you have directories doe3.3B, bus.3B, mag.3B and news.3B. in, for example, ~/ISRI-OCRtk. +``` mkdir -p ~/ISRI-OCRtk cd ~/ISRI-OCRtk tar xzvf ~/isri-downloads/bus.3B.tar.gz @@ -30,26 +34,37 @@ tar xzvf ~/isri-downloads/doe3.3B.tar.gz tar xzvf ~/isri-downloads/mag.3B.tar.gz tar xzvf ~/isri-downloads/news.3B.tar.gz tar xzvf ~/isri-downloads/spn.3B.tar.gz +``` -**** Edit ~/ISRI-OCRtk/spn.3B/pages +Edit *~/ISRI-OCRtk/spn.3B/pages* delete the line containing the following imagename as it crashes tesseract. + 7733_005.3B.tif -Step 4: Download the modified ISRI toolkit and make and install the tools : +### Step 3: Download the modified ISRI toolkit, make and install the tools : +These will be installed in /usr/local/bin. 
+``` git clone https://github.com/Shreeshrii/ocr-evaluation-tools.git cd ~/ocr-evaluation-tools sudo make install +``` -Step 6: cd back to your main tesseract-ocr dir and Build tesseract. +### Step 4: cd back to your main tesseract-ocr dir and Build tesseract. -Step 7: run unlvtests/runalltests.sh with the root ISRI data dir and testname, tessdata-dir and language: +### Step 5: run unlvtests/runalltests.sh with the root ISRI data dir, testname, tessdata-dir and language: +``` unlvtests/runalltests.sh ~/ISRI-OCRtk 4_fast_eng ../tessdata_fast eng -and go to the gym, have lunch etc. +``` +and go to the gym, have lunch etc. It takes a while to run. -Step 8: There should be a file -unlvtests/reports/4-beta_fast.summary that contains the final summarized accuracy +### Step 6: There should be a RELEASE.summary file +*unlvtests/reports/4_fast_eng.summary* that contains the final summarized accuracy report and comparison with the 1995 results. +### Step 7: run the test for Spanish. + +``` unlvtests/runalltests.sh ~/ISRI-OCRtk 4_fast_spa ../tessdata_fast spa +``` From 477f57adf22e0168c45196838fb4066fbc22bb15 Mon Sep 17 00:00:00 2001 From: Shreeshrii Date: Fri, 8 Jun 2018 21:52:25 +0530 Subject: [PATCH 3/3] correct URL for 1995 report --- unlvtests/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unlvtests/README.md b/unlvtests/README.md index c8874703..98ef8c25 100644 --- a/unlvtests/README.md +++ b/unlvtests/README.md @@ -2,7 +2,7 @@ The scripts in this directory make it possible to duplicate the tests published in the Fourth Annual Test of OCR Accuracy. -See http://www.isri.unlv.edu/downloads/AT-1995.pdf +See http://www.expervision.com/wp-content/uploads/2012/12/1995.The_Fourth_Annual_Test_of_OCR_Accuracy.pdf but first you have to get the tools and data used by UNLV: ### Step 1: to download the images go to