From 6559af0c9d23dbe5dff3e4c1b73b272a14875053 Mon Sep 17 00:00:00 2001 From: Shree Devi Kumar Date: Sat, 9 Jun 2018 12:47:09 +0000 Subject: [PATCH 1/5] update Spanish UNLV test, use spa.stopwords, iconv to UTF-8 --- unlvtests/README.md | 30 ++++++-- unlvtests/counttestset.sh | 22 ++++-- unlvtests/reports/1995.spn.3B.sum | 1 - unlvtests/runalltests.sh | 24 ++----- unlvtests/runalltests_spa.sh | 109 ++++++++++++++++++++++++++++++ 5 files changed, 156 insertions(+), 30 deletions(-) delete mode 100644 unlvtests/reports/1995.spn.3B.sum create mode 100755 unlvtests/runalltests_spa.sh diff --git a/unlvtests/README.md b/unlvtests/README.md index 98ef8c258..4522ab5bc 100644 --- a/unlvtests/README.md +++ b/unlvtests/README.md @@ -34,11 +34,15 @@ tar xzvf ~/isri-downloads/doe3.3B.tar.gz tar xzvf ~/isri-downloads/mag.3B.tar.gz tar xzvf ~/isri-downloads/news.3B.tar.gz tar xzvf ~/isri-downloads/spn.3B.tar.gz +mkdir -p stopwords +cd stopwords +wget -O spa.stopwords.txt https://raw.githubusercontent.com/stopwords-iso/stopwords-es/master/stopwords-es.txt ``` +Edit ~/ISRI-OCRtk/stopwords/spa.stopwords.txt +wordacc uses a space delimited stopwords file, not line delimited. Edit *~/ISRI-OCRtk/spn.3B/pages* delete the line containing the following imagename as it crashes tesseract. - 7733_005.3B.tif ### Step 3: Download the modified ISRI toolkit, make and install the tools : @@ -52,10 +56,10 @@ sudo make install ### Step 4: cd back to your main tesseract-ocr dir and Build tesseract. -### Step 5: run unlvtests/runalltests.sh with the root ISRI data dir, testname, tessdata-dir and language: +### Step 5: run unlvtests/runalltests.sh with the root ISRI data dir, testname, tessdata-dir: ``` -unlvtests/runalltests.sh ~/ISRI-OCRtk 4_fast_eng ../tessdata_fast eng +unlvtests/runalltests.sh ~/ISRI-OCRtk 4_fast_eng ../tessdata_fast ``` and go to the gym, have lunch etc. It takes a while to run. @@ -66,5 +70,23 @@ report and comparison with the 1995 results. ### Step 7: run the test for Spanish. 
``` -unlvtests/runalltests.sh ~/ISRI-OCRtk 4_fast_spa ../tessdata_fast spa +unlvtests/runalltests_spa.sh ~/ISRI-OCRtk 4_fast_spa ../tessdata_fast ``` + +#### Notes from Nick White regarding wordacc + +If you just want to remove all lines which have 100% recognition, +you can add an 'awk' command like this: + +ocrevalutf8 wordacc ground.txt ocr.txt | awk '$3 != 100 {print $0}' +results.txt + +or if you've already got a results file you want to change, you can do this: + +awk '$3 != 100 {print $0}' results.txt newresults.txt + +If you only want the last sections where things are broken down by +word, you can add a sed command, like this: + +ocrevalutf8 wordacc ground.txt ocr.txt | sed '/^ Count Missed %Right $/,$ +!d' | awk '$3 != 100 {print $0}' results.txt diff --git a/unlvtests/counttestset.sh b/unlvtests/counttestset.sh index 560c73f7c..be380b371 100755 --- a/unlvtests/counttestset.sh +++ b/unlvtests/counttestset.sh @@ -15,9 +15,9 @@ # See the License for the specific language governing permissions and # limitations under the License. -if [ $# -ne 1 ] +if [ $# -ne 2 ] then - echo "Usage:$0 pagesfile" + echo "Usage:$0 pagesfile langcode" exit 1 fi if [ ! -d src/api ] @@ -27,6 +27,7 @@ then fi pages=$1 +langcode=$2 imdir=${pages%/pages} setname=${imdir##*/} @@ -45,15 +46,22 @@ do fi #echo "$srcdir/$page.tif" # Count character errors. - ocrevalutf8 accuracy "$srcdir/$page.txt" "$resdir/$page.unlv" > "$resdir/$page.acc" + iconv -f ISO8859-1 -t UTF-8 "$resdir/$page.unlv" >"$resdir/$page.text" + iconv -f ISO8859-1 -t UTF-8 "$srcdir/$page.txt" >"$srcdir/$page.text" + ocrevalutf8 accuracy "$srcdir/$page.text" "$resdir/$page.text" > "$resdir/$page.acc" accfiles="$accfiles $resdir/$page.acc" # Count word errors. 
- ocrevalutf8 wordacc "$srcdir/$page.txt" "$resdir/$page.unlv" > "$resdir/$page.wa" + #langcode should be either eng or spa + if [ "$langcode" = "eng" ] + then + ocrevalutf8 wordacc "$srcdir/$page.text" "$resdir/$page.text" > "$resdir/$page.wa" + else + cp /home/ubuntu/ISRI-OCRtk/stopwords/spa.stopwords.txt "$resdir/spa.stopwords" + ocrevalutf8 wordacc -S"$resdir/spa.stopwords" "$srcdir/$page.text" "$resdir/$page.text" > "$resdir/$page.wa" + fi wafiles="$wafiles $resdir/$page.wa" done <"$pages" -#echo "$accfiles" -#echo "$wafiles" - accsum $accfiles >"unlvtests/results/$setname.characc" wordaccsum $wafiles >"unlvtests/results/$setname.wordacc" + diff --git a/unlvtests/reports/1995.spn.3B.sum b/unlvtests/reports/1995.spn.3B.sum deleted file mode 100644 index 35060967f..000000000 --- a/unlvtests/reports/1995.spn.3B.sum +++ /dev/null @@ -1 +0,0 @@ -1995 spn.3B 100 95.00% 0.00% 100 95.00% 0.00% 100 95.00% 0.00% WAS NOT TESTED diff --git a/unlvtests/runalltests.sh b/unlvtests/runalltests.sh index 18ef3929f..5cdf5e85f 100755 --- a/unlvtests/runalltests.sh +++ b/unlvtests/runalltests.sh @@ -1,6 +1,6 @@ #!/bin/bash # File: runalltests.sh -# Description: Script to run a set of UNLV test sets. +# Description: Script to run a set of UNLV test sets for English. # Author: Ray Smith # Created: Thu Jun 14 08:21:01 PDT 2007 # @@ -15,9 +15,9 @@ # See the License for the specific language governing permissions and # limitations under the License. -if [ $# -ne 4 ] +if [ $# -ne 3 ] then - echo "Usage:$0 unlv-data-dir version-id tessdata-dir lang " + echo "Usage:$0 unlv-data-dir version-id tessdata-dir" exit 1 fi if [ ! 
-d src/api ] @@ -31,7 +31,6 @@ then exit 1 fi tessdata=$3 -lang=$4 #deltapc new old calculates the %change from old to new deltapc() { @@ -62,19 +61,8 @@ then fi rdir=unlvtests/reports -if [ "$lang" = "eng" ] -then - testsets="bus.3B doe3.3B mag.3B news.3B" - #testsets="bus.3B" -else - if [ "$lang" = "spa" ] - then - testsets="spn.3B" - else - echo "Language has to be eng or spa" - exit 1 - fi -fi +testsets="bus.3B doe3.3B mag.3B news.3B" +#testsets="bus.3B" totalerrs=0 totalwerrs=0 @@ -87,7 +75,7 @@ do if [ -r "$imdir/$set/pages" ] then # Run tesseract on all the pages. - $bindir/runtestset.sh "$imdir/$set/pages" "$tessdata" "$lang" + $bindir/runtestset.sh "$imdir/$set/pages" "$tessdata" "eng" # Count the errors on all the pages. $bindir/counttestset.sh "$imdir/$set/pages" # Get the old character word and nonstop word errors. diff --git a/unlvtests/runalltests_spa.sh b/unlvtests/runalltests_spa.sh new file mode 100755 index 000000000..a6e218bbc --- /dev/null +++ b/unlvtests/runalltests_spa.sh @@ -0,0 +1,109 @@ +#!/bin/bash +############################################################################## +# File: runalltests_spa.sh +# Description: Script to run a set of UNLV test sets for Spanish. +# based on runalltests.sh by Ray Smith +# Author: Shree Devi Kumar +# Created: June 09, 2018 +# +# (C) Copyright 2007, Google Inc. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+############################################################################## +if [ $# -ne 3 ] +then + echo "Usage:$0 unlv-data-dir version-id tessdata-dir" + exit 1 +fi +if [ ! -d src/api ] +then + echo "Run $0 from the tesseract-ocr root directory!" + exit 1 +fi +if [ ! -r src/api/tesseract ] && [ ! -r tesseract.exe ] +then + echo "Please build tesseract before running $0" + exit 1 +fi +tessdata=$3 +lang=$4 + +#timesum computes the total cpu time +timesum() { +awk ' BEGIN { +total = 0.0; +} +{ + total += $2; +} +END { + printf("%.2f\n", total); +}' "$1" +} + +imdir="$1" +vid="$2" +bindir=${0%/*} +if [ "$bindir" = "$0" ] +then + bindir="./" +fi +rdir=unlvtests/reports + +testsets="spn.3B" + +totalerrs=0 +totalwerrs=0 +totalnswerrs=0 +for set in $testsets +do + if [ -r "$imdir/$set/pages" ] + then + # Run tesseract on all the pages. + $bindir/runtestset.sh "$imdir/$set/pages" "$tessdata" "spa" + # Count the errors on all the pages. + $bindir/counttestset.sh "$imdir/$set/pages" "spa" + # Get the new character word and nonstop word errors and accuracy. 
+ cherrs=$(head -4 "unlvtests/results/$set.characc" |tail -1 |cut -c1-9 | + tr -d '[:blank:]') + chacc=$(head -5 "unlvtests/results/$set.characc" |tail -1 |cut -c1-9 | + tr -d '[:blank:]') + wderrs=$(head -4 "unlvtests/results/$set.wordacc" |tail -1 |cut -c1-9 | + tr -d '[:blank:]') + wdacc=$(head -5 "unlvtests/results/$set.wordacc" |tail -1 |cut -c1-9 | + tr -d '[:blank:]') + nswderrs=$(grep Total "unlvtests/results/$set.wordacc" |head -2 |tail -1 | + cut -c10-17 |tr -d '[:blank:]') + nswdacc=$(grep Total "unlvtests/results/$set.wordacc" |head -2 |tail -1 | + cut -c19-26 |tr -d '[:blank:]') + +sumfile=$rdir/$vid.$set.sum + if [ -r "unlvtests/results/$set.times" ] + then + total_time=$(timesum "unlvtests/results/$set.times") + if [ -r "unlvtests/results/prev/$set.times" ] + then + paste "unlvtests/results/prev/$set.times" "unlvtests/results/$set.times" | + awk '{ printf("%s %.2f\n", $1, $4-$2); }' |sort -k2n >"unlvtests/results/$set.timedelta" + fi + else + total_time='0.0' + fi + echo "RELEASE TestSet CharErrors Accuracy WordErrors Accuracy\ + NonStopWordErrors Accuracy TimeTaken">"$sumfile" + echo "$vid $set $cherrs $chacc $wderrs $wdacc\ + $nswderrs $nswdacc ${total_time}s" >>"$sumfile" + fi +done + +cat "$rdir/$vid".*.sum >"$rdir/$vid".summary + +mv "$rdir/$vid".*.sum unlvtests/results/ +cat "$rdir/$vid".summary From 86700fd345c1164aefa2761286358c2d7c6e503f Mon Sep 17 00:00:00 2001 From: Shree Devi Kumar Date: Sat, 9 Jun 2018 13:07:21 +0000 Subject: [PATCH 2/5] add summary for Spanish UNLV test with 4.0.0-beta with --tessdata_fast --- unlvtests/Makefile.am | 1 + unlvtests/reports/4_fast_spa.summary | 2 ++ 2 files changed, 3 insertions(+) create mode 100644 unlvtests/reports/4_fast_spa.summary diff --git a/unlvtests/Makefile.am b/unlvtests/Makefile.am index bf5731df8..8fc2eb764 100644 --- a/unlvtests/Makefile.am +++ b/unlvtests/Makefile.am @@ -10,3 +10,4 @@ EXTRA_DIST += reports/1995.mag.3B.sum EXTRA_DIST += reports/1995.news.3B.sum EXTRA_DIST += 
reports/2.03.summary EXTRA_DIST += reports/2.04.summary +EXTRA_DIST += reports/4_fast_spa.summary diff --git a/unlvtests/reports/4_fast_spa.summary b/unlvtests/reports/4_fast_spa.summary new file mode 100644 index 000000000..6d25fe333 --- /dev/null +++ b/unlvtests/reports/4_fast_spa.summary @@ -0,0 +1,2 @@ +RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWordErrors Accuracy TimeTaken +4_fast_spa spn.3B 2841 99.18% 879 98.49% 742 97.53 3838.82s From 4290951fc19fc9c3ccc120f696b156780bed351a Mon Sep 17 00:00:00 2001 From: Shree Devi Kumar Date: Sat, 9 Jun 2018 14:36:10 +0000 Subject: [PATCH 3/5] add summary for Spanish UNLV test with 4.0.0-beta with --tessdata_best and --tessdata --- unlvtests/Makefile.am | 2 ++ unlvtests/reports/4_best_int_spa.summary | 2 ++ unlvtests/reports/4_best_spa.summary | 2 ++ 3 files changed, 6 insertions(+) create mode 100644 unlvtests/reports/4_best_int_spa.summary create mode 100644 unlvtests/reports/4_best_spa.summary diff --git a/unlvtests/Makefile.am b/unlvtests/Makefile.am index 8fc2eb764..23790c713 100644 --- a/unlvtests/Makefile.am +++ b/unlvtests/Makefile.am @@ -10,4 +10,6 @@ EXTRA_DIST += reports/1995.mag.3B.sum EXTRA_DIST += reports/1995.news.3B.sum EXTRA_DIST += reports/2.03.summary EXTRA_DIST += reports/2.04.summary +EXTRA_DIST += reports/4_best_spa.summary +EXTRA_DIST += reports/4_best_int_spa.summary EXTRA_DIST += reports/4_fast_spa.summary diff --git a/unlvtests/reports/4_best_int_spa.summary b/unlvtests/reports/4_best_int_spa.summary new file mode 100644 index 000000000..cbb92073a --- /dev/null +++ b/unlvtests/reports/4_best_int_spa.summary @@ -0,0 +1,2 @@ +RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWordErrors Accuracy TimeTaken +4_best_int_spa spn.3B 2846 99.18% 937 98.39% 739 97.54 6478.02s diff --git a/unlvtests/reports/4_best_spa.summary b/unlvtests/reports/4_best_spa.summary new file mode 100644 index 000000000..69a7b75d8 --- /dev/null +++ b/unlvtests/reports/4_best_spa.summary @@ 
-0,0 +1,2 @@ +RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWordErrors Accuracy TimeTaken +4_best_spa spn.3B 2823 99.19% 924 98.41% 729 97.57 7233.76s From a01d1604c301cac64f0e243c413846abf3553f77 Mon Sep 17 00:00:00 2001 From: Shree Devi Kumar Date: Sat, 9 Jun 2018 14:44:54 +0000 Subject: [PATCH 4/5] update readme --- unlvtests/README.md | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/unlvtests/README.md b/unlvtests/README.md index 4522ab5bc..d98df97fc 100644 --- a/unlvtests/README.md +++ b/unlvtests/README.md @@ -40,10 +40,12 @@ wget -O spa.stopwords.txt https://raw.githubusercontent.com/stopwords-iso/stopwo ``` Edit ~/ISRI-OCRtk/stopwords/spa.stopwords.txt wordacc uses a space delimited stopwords file, not line delimited. +s/\n/ /g -Edit *~/ISRI-OCRtk/spn.3B/pages* -delete the line containing the following imagename as it crashes tesseract. -7733_005.3B.tif +Edit ~/ISRI-OCRtk/spn.3B/pages +Delete the line containing the following imagename as it [crashes tesseract](https://github.com/tesseract-ocr/tesseract/issues/1647#issuecomment-395954717). + +7733_005.3B 3 ### Step 3: Download the modified ISRI toolkit, make and install the tools : These will be installed in /usr/local/bin. 
From d8bed41ec3a55788e4044d26dfa62c8c57086627 Mon Sep 17 00:00:00 2001 From: Shreeshrii Date: Sat, 9 Jun 2018 20:17:51 +0530 Subject: [PATCH 5/5] change filename to generic ~/ --- unlvtests/counttestset.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unlvtests/counttestset.sh b/unlvtests/counttestset.sh index be380b371..7e3d7b5f2 100755 --- a/unlvtests/counttestset.sh +++ b/unlvtests/counttestset.sh @@ -56,7 +56,7 @@ do then ocrevalutf8 wordacc "$srcdir/$page.text" "$resdir/$page.text" > "$resdir/$page.wa" else - cp /home/ubuntu/ISRI-OCRtk/stopwords/spa.stopwords.txt "$resdir/spa.stopwords" + cp ~/ISRI-OCRtk/stopwords/spa.stopwords.txt "$resdir/spa.stopwords" ocrevalutf8 wordacc -S"$resdir/spa.stopwords" "$srcdir/$page.text" "$resdir/$page.text" > "$resdir/$page.wa" fi wafiles="$wafiles $resdir/$page.wa"