Merge pull request #1650 from Shreeshrii/master

Add Spanish UNLV tests, use ocreval tools from /usr/local/bin
2024-12-12 07:29:07 +08:00 · 2018-06-09 10:59:50 +03:00 · 2018-06-09 10:59:50 +03:00 · 3f725dd92e
commit 3f725dd92e
parent e7c1e0739c 477f57adf2
5 changed files with 90 additions and 46 deletions
--- a/unlvtests/README.md
+++ b/unlvtests/README.md
@ -1,45 +1,70 @@
-How to run UNLV tests.
+## How to run UNLV tests.

 The scripts in this directory make it possible to duplicate the tests
 published in the Fourth Annual Test of OCR Accuracy.
-See http://www.isri.unlv.edu/downloads/AT-1995.pdf
+See http://www.expervision.com/wp-content/uploads/2012/12/1995.The_Fourth_Annual_Test_of_OCR_Accuracy.pdf
 but first you have to get the tools and data used by  UNLV:

-Step 1: to download the images goto
+### Step 1: to download the images go to
 https://sourceforge.net/projects/isri-ocr-evaluation-tools-alt/files/ 
 and get doe3.3B.tar.gz, bus.3B.tar.gz, mag.3B.tar.gz and news.3B.tar.gz
+The spn.3B.tar.gz in that repository is incorrect, so download it from the Google Code archive instead.

+```
 mkdir -p ~/isri-downloads
 cd ~/isri-downloads
 curl  -L https://sourceforge.net/projects/isri-ocr-evaluation-tools-alt/files/bus.3B.tar.gz > bus.3B.tar.gz
 curl  -L https://sourceforge.net/projects/isri-ocr-evaluation-tools-alt/files/doe3.3B.tar.gz > doe3.3B.tar.gz
 curl  -L https://sourceforge.net/projects/isri-ocr-evaluation-tools-alt/files/mag.3B.tar.gz > mag.3B.tar.gz
 curl  -L https://sourceforge.net/projects/isri-ocr-evaluation-tools-alt/files/news.3B.tar.gz > news.3B.tar.gz
+curl  -L https://storage.googleapis.com/google-code-archive-downloads/v2/code.google.com/isri-ocr-evaluation-tools/spn.3B.tar.gz > spn.3B.tar.gz
+```

-Step 2: extract the files. It doesn't really matter where
+### Step 2: extract the files. 
+It doesn't really matter where
 in your filesystem you put them, but they must go under a common
 root so you have directories doe3.3B, bus.3B, mag.3B and news.3B in, for example,
 ~/ISRI-OCRtk.

+```
 mkdir -p ~/ISRI-OCRtk
 cd ~/ISRI-OCRtk
 tar xzvf ~/isri-downloads/bus.3B.tar.gz
 tar xzvf ~/isri-downloads/doe3.3B.tar.gz
 tar xzvf ~/isri-downloads/mag.3B.tar.gz
 tar xzvf ~/isri-downloads/news.3B.tar.gz
+tar xzvf ~/isri-downloads/spn.3B.tar.gz
+```

-Step 4: Download the modified ISRI toolkit from:
-https://ancientgreekocr.org/ocr-evaluation-tools.git
+Edit *~/ISRI-OCRtk/spn.3B/pages* and
+delete the line containing the following image name, as it crashes tesseract:

-make and install the tools in unlvtests/ocreval/bin by
-`make PREFIX=~/tesseract/unlvtests/ocreval install`
+7733_005.3B.tif

-Step 6: cd back to your main tesseract-ocr dir and Build tesseract.
+### Step 3: Download the modified ISRI toolkit, then make and install the tools:
+These will be installed in /usr/local/bin.

-Step 7: run unlvtests/runalltests.sh with the root ISRI data dir and testname:
-unlvtests/runalltests.sh ~/ISRI-OCRtk tess4.0.0-beta.1
-and go to the gym, have lunch etc.
+```
+git clone https://github.com/Shreeshrii/ocr-evaluation-tools.git
+cd ocr-evaluation-tools
+sudo make install
+```

-Step 8: There should be a file
-unlvtests/reports/tess4.0.0-beta.1.summary that contains the final summarized accuracy
+### Step 4: cd back to your main tesseract-ocr dir and Build tesseract.
+
+### Step 5: run unlvtests/runalltests.sh with the root ISRI data dir, testname, tessdata-dir and language:
+
+```
+unlvtests/runalltests.sh ~/ISRI-OCRtk 4_fast_eng ../tessdata_fast eng
+```
+and go to the gym, have lunch etc. It takes a while to run.
+
+### Step 6: There should be a RELEASE.summary file
+*unlvtests/reports/4_fast_eng.summary* that contains the final summarized accuracy
 report and comparison with the 1995 results.
+
+### Step 7: run the test for Spanish.
+
+```
+unlvtests/runalltests.sh ~/ISRI-OCRtk 4_fast_spa ../tessdata_fast spa
+```
--- a/unlvtests/counttestset.sh
+++ b/unlvtests/counttestset.sh
@ -43,17 +43,17 @@ do
  else
     srcdir="$imdir"
  fi
-echo "$srcdir/$page.tif"
+#echo "$srcdir/$page.tif"
  # Count character errors.
-  unlvtests/ocreval/bin/ocrevalutf8  unlvtests/ocreval/bin/accuracy "$srcdir/$page.txt" "$resdir/$page.unlv" "$resdir/$page.acc"
+  ocrevalutf8  accuracy "$srcdir/$page.txt" "$resdir/$page.unlv" > "$resdir/$page.acc"
  accfiles="$accfiles $resdir/$page.acc"
  # Count word errors.
-    unlvtests/ocreval/bin/ocrevalutf8  unlvtests/ocreval/bin/wordacc "$srcdir/$page.txt" "$resdir/$page.unlv" "$resdir/$page.wa"
+  ocrevalutf8  wordacc "$srcdir/$page.txt" "$resdir/$page.unlv" > "$resdir/$page.wa"
  wafiles="$wafiles $resdir/$page.wa"
 done <"$pages"

-echo "$accfiles"
-echo "$wafiles"
+#echo "$accfiles"
+#echo "$wafiles"

-  unlvtests/ocreval/bin/accsum $accfiles >"unlvtests/reports/$setname.characc"
-  unlvtests/ocreval/bin/ocrevalutf8 unlvtests/ocreval/bin/wordaccsum $wafiles >"unlvtests/reports/$setname.wordacc"
+accsum $accfiles >"unlvtests/results/$setname.characc"
+wordaccsum $wafiles >"unlvtests/results/$setname.wordacc"
--- a/unlvtests/reports/1995.spn.3B.sum
+++ b/unlvtests/reports/1995.spn.3B.sum
@ -0,0 +1 @@
+1995	spn.3B	100	95.00%	0.00%	100	95.00%	0.00%	100	95.00%	0.00% WAS NOT TESTED
--- a/unlvtests/runalltests.sh
+++ b/unlvtests/runalltests.sh
@ -15,9 +15,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-if [ $# -ne 2 ]
+if [ $# -ne 4 ]
 then
-   echo "Usage:$0 unlv-data-dir version-id"
+   echo "Usage:$0 unlv-data-dir version-id tessdata-dir lang "
   exit 1
 fi
 if [ ! -d src/api ]
@ -30,7 +30,8 @@ then
  echo "Please build tesseract before running $0"
  exit 1
 fi
-
+tessdata=$3
+lang=$4

 #deltapc new old calculates the %change from old to new
 deltapc() {
@ -60,8 +61,20 @@ then
    bindir="./"
 fi
 rdir=unlvtests/reports
-#testsets="bus.3B doe3.3B mag.3B news.3B"
-testsets="bus.3B"
+
+if [ "$lang" = "eng" ]
+then
+    testsets="bus.3B doe3.3B mag.3B news.3B"
+    #testsets="bus.3B"
+else
+    if [ "$lang" = "spa" ]
+    then
+        testsets="spn.3B"
+    else
+        echo "Language has to be eng or spa"
+        exit 1
+    fi
+fi

 totalerrs=0
 totalwerrs=0
@ -74,7 +87,7 @@ do
    if [ -r "$imdir/$set/pages" ]
    then
 	# Run tesseract on all the pages.
-	$bindir/runtestset.sh "$imdir/$set/pages"
+	$bindir/runtestset.sh "$imdir/$set/pages" "$tessdata" "$lang"
 	# Count the errors on all the pages.
 	$bindir/counttestset.sh "$imdir/$set/pages"
 	# Get the old character word and nonstop word errors.
@ -82,30 +95,30 @@ do
 	oldwerrs=$(cut -f6 "unlvtests/reports/1995.$set.sum")
 	oldnswerrs=$(cut -f9 "unlvtests/reports/1995.$set.sum")
 	# Get the new character word and nonstop word errors and accuracy.
-	cherrs=$(head -4 "unlvtests/reports/$set.characc" |tail -1 |cut -c1-9 |
+	cherrs=$(head -4 "unlvtests/results/$set.characc" |tail -1 |cut -c1-9 |
 	    tr -d '[:blank:]')
-	chacc=$(head -5 "unlvtests/reports/$set.characc" |tail -1 |cut -c1-9 |
+	chacc=$(head -5 "unlvtests/results/$set.characc" |tail -1 |cut -c1-9 |
 	    tr -d '[:blank:]')
-	wderrs=$(head -4 "unlvtests/reports/$set.wordacc" |tail -1 |cut -c1-9 |
+	wderrs=$(head -4 "unlvtests/results/$set.wordacc" |tail -1 |cut -c1-9 |
 	    tr -d '[:blank:]')
-	wdacc=$(head -5 "unlvtests/reports/$set.wordacc" |tail -1 |cut -c1-9 |
+	wdacc=$(head -5 "unlvtests/results/$set.wordacc" |tail -1 |cut -c1-9 |
 	    tr -d '[:blank:]')
-	nswderrs=$(grep Total "unlvtests/reports/$set.wordacc" |head -2 |tail -1 |
+	nswderrs=$(grep Total "unlvtests/results/$set.wordacc" |head -2 |tail -1 |
 	    cut -c10-17 |tr -d '[:blank:]')
-	nswdacc=$(grep Total "unlvtests/reports/$set.wordacc" |head -2 |tail -1 |
+	nswdacc=$(grep Total "unlvtests/results/$set.wordacc" |head -2 |tail -1 |
 	    cut -c19-26 |tr -d '[:blank:]')
 	# Compute the percent change.
 	chdelta=$(deltapc "$cherrs" "$olderrs")
 	wdelta=$(deltapc "$wderrs" "$oldwerrs")
 	nswdelta=$(deltapc "$nswderrs" "$oldnswerrs")
 	sumfile=$rdir/$vid.$set.sum
-        if [ -r "unlvtests/reports/$set.times" ]
+        if [ -r "unlvtests/results/$set.times" ]
        then
-          total_time=$(timesum "unlvtests/reports/$set.times")
-          if [ -r "unlvtests/reports/prev/$set.times" ]
+          total_time=$(timesum "unlvtests/results/$set.times")
+          if [ -r "unlvtests/results/prev/$set.times" ]
          then
-            paste "unlvtests/reports/prev/$set.times" "unlvtests/reports/$set.times" |
-              awk '{ printf("%s %.2f\n", $1, $4-$2); }' |sort -k2n >"unlvtests/reports/$set.timedelta"
+            paste "unlvtests/results/prev/$set.times" "unlvtests/results/$set.times" |
+              awk '{ printf("%s %.2f\n", $1, $4-$2); }' |sort -k2n >"unlvtests/results/$set.timedelta"
          fi
 	else
          total_time='0.0'
@ -129,3 +142,6 @@ tfile=$rdir/$vid.total.sum
 echo "$vid	Total	$totalerrs	-	$chdelta%	$totalwerrs\
 	-	$wdelta%	$totalnswerrs	-	$nswdelta%" >"$tfile"
 cat $rdir/1995.*.sum "$rdir/$vid".*.sum >"$rdir/$vid".summary
+
+mv "$rdir/$vid".*.sum unlvtests/results/
+cat "$rdir/$vid".summary
--- a/unlvtests/runtestset.sh
+++ b/unlvtests/runtestset.sh
@ -15,9 +15,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-if [ $# -ne 1 ] && [ $# -ne 2 ]
+if  [ $# -ne 3 ] && [ $# -ne 4 ]
 then
-  echo "Usage:$0 pagesfile [-zoning]"
+  echo "Usage:$0 pagesfile tessdata-dir lang [-zoning]"
  exit 1
 fi
 if [ ! -d src/api ]
@ -36,13 +36,15 @@ then
  fi
 else
  tess="time -f %U -o times.txt src/api/tesseract"
-  export TESSDATA_PREFIX=$PWD/
+  #tess="time -f %U -o times.txt tesseract"
 fi

+tessdata=$2
+lang=$3
 pages=$1
 imdir=${pages%/pages}
 setname=${imdir##*/}
-if [ $# -eq 2 ] && [ "$2" = "-zoning" ]
+if [ $# -eq 4 ] && [ "$4" = "-zoning" ]
 then
  config=unlv.auto
  resdir=unlvtests/results/zoning.$setname
@ -52,7 +54,7 @@ else
 fi
 echo -e "Testing on set $setname in directory $imdir to $resdir\n"
 mkdir -p "$resdir"
-rm -f "unlvtests/reports/$setname.times"
+rm -f "unlvtests/results/$setname.times"
 while read page dir
 do
  # A pages file may be a list of files with subdirs or maybe just
@ -64,11 +66,11 @@ do
     srcdir="$imdir"
  fi
 #  echo "$srcdir/$page.tif"
-  $tess "$srcdir/$page.tif" "$resdir/$page" --tessdata-dir ../tessdata_fast --oem 1 -l eng --psm 6 $config 2>&1 |grep -v "OCR Engine" |grep -v "Page 1"
+  $tess "$srcdir/$page.tif" "$resdir/$page" --tessdata-dir $tessdata --oem 1 -l $lang --psm 6 $config 2>&1 |grep -v "OCR Engine" |grep -v "Page 1"
  if [ -r times.txt ]
  then
    read t <times.txt
-    echo "$page $t" >>"unlvtests/reports/$setname.times"
+    echo "$page $t" >>"unlvtests/results/$setname.times"
    echo -e "\033M$page $t"
    if [ "$t" = "Command terminated by signal 2" ]
    then
				`@ -0,0 +1 @@`
				`1995 spn.3B 100 95.00% 0.00% 100 95.00% 0.00% 100 95.00% 0.00% WAS NOT TESTED`