Merge pull request #1657 from Shreeshrii/master

update Spanish UNLV test, use spa.stopwords, iconv to UTF-8
This commit is contained in:
Egor Pugin 2018-06-09 18:31:35 +03:00 committed by GitHub
commit 37dadbe478
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
9 changed files with 169 additions and 32 deletions

View File

@ -10,3 +10,6 @@ EXTRA_DIST += reports/1995.mag.3B.sum
EXTRA_DIST += reports/1995.news.3B.sum
EXTRA_DIST += reports/2.03.summary
EXTRA_DIST += reports/2.04.summary
EXTRA_DIST += reports/4_best_spa.summary
EXTRA_DIST += reports/4_best_int_spa.summary
EXTRA_DIST += reports/4_fast_spa.summary

View File

@ -34,12 +34,18 @@ tar xzvf ~/isri-downloads/doe3.3B.tar.gz
tar xzvf ~/isri-downloads/mag.3B.tar.gz
tar xzvf ~/isri-downloads/news.3B.tar.gz
tar xzvf ~/isri-downloads/spn.3B.tar.gz
mkdir -p stopwords
cd stopwords
wget -O spa.stopwords.txt https://raw.githubusercontent.com/stopwords-iso/stopwords-es/master/stopwords-es.txt
```
Edit ~/ISRI-OCRtk/stopwords/spa.stopwords.txt
wordacc uses a space delimited stopwords file, not line delimited.
s/\n/ /g
Edit *~/ISRI-OCRtk/spn.3B/pages*
delete the line containing the following imagename as it crashes tesseract.
Edit ~/ISRI-OCRtk/spn.3B/pages
Delete the line containing the following imagename as it [crashes tesseract](https://github.com/tesseract-ocr/tesseract/issues/1647#issuecomment-395954717).
7733_005.3B.tif
7733_005.3B 3
### Step 3: Download the modified ISRI toolkit, make and install the tools :
These will be installed in /usr/local/bin.
@ -52,10 +58,10 @@ sudo make install
### Step 4: cd back to your main tesseract-ocr dir and Build tesseract.
### Step 5: run unlvtests/runalltests.sh with the root ISRI data dir, testname, tessdata-dir and language:
### Step 5: run unlvtests/runalltests.sh with the root ISRI data dir, testname, tessdata-dir:
```
unlvtests/runalltests.sh ~/ISRI-OCRtk 4_fast_eng ../tessdata_fast eng
unlvtests/runalltests.sh ~/ISRI-OCRtk 4_fast_eng ../tessdata_fast
```
and go to the gym, have lunch etc. It takes a while to run.
@ -66,5 +72,23 @@ report and comparison with the 1995 results.
### Step 7: run the test for Spanish.
```
unlvtests/runalltests.sh ~/ISRI-OCRtk 4_fast_spa ../tessdata_fast spa
unlvtests/runalltests_spa.sh ~/ISRI-OCRtk 4_fast_spa ../tessdata_fast
```
#### Notes from Nick White regarding wordacc
If you just want to remove all lines which have 100% recognition,
you can add a 'awk' command like this:
ocrevalutf8 wordacc ground.txt ocr.txt | awk '$3 != 100 {print $0}' > results.txt
or if you've already got a results file you want to change, you can do this:
awk '$3 != 100 {print $0}' results.txt > newresults.txt
If you only want the last sections where things are broken down by
word, you can add a sed command, like this:
ocrevalutf8 wordacc ground.txt ocr.txt | sed '/^ Count Missed %Right $/,$!d' | awk '$3 != 100 {print $0}' > results.txt

View File

@ -15,9 +15,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.
if [ $# -ne 1 ]
if [ $# -ne 2 ]
then
echo "Usage:$0 pagesfile"
echo "Usage:$0 pagesfile langcode"
exit 1
fi
if [ ! -d src/api ]
@ -27,6 +27,7 @@ then
fi
pages=$1
langcode=$2
imdir=${pages%/pages}
setname=${imdir##*/}
@ -45,15 +46,22 @@ do
fi
#echo "$srcdir/$page.tif"
# Count character errors.
ocrevalutf8 accuracy "$srcdir/$page.txt" "$resdir/$page.unlv" > "$resdir/$page.acc"
iconv -f ISO8859-1 -t UTF-8 "$resdir/$page.unlv" >"$resdir/$page.text"
iconv -f ISO8859-1 -t UTF-8 "$srcdir/$page.txt" >"$srcdir/$page.text"
ocrevalutf8 accuracy "$srcdir/$page.text" "$resdir/$page.text" > "$resdir/$page.acc"
accfiles="$accfiles $resdir/$page.acc"
# Count word errors.
ocrevalutf8 wordacc "$srcdir/$page.txt" "$resdir/$page.unlv" > "$resdir/$page.wa"
#langcode should be either eng or spa
if [ "$langcode" = "eng" ]
then
ocrevalutf8 wordacc "$srcdir/$page.text" "$resdir/$page.text" > "$resdir/$page.wa"
else
cp ~/ISRI-OCRtk/stopwords/spa.stopwords.txt "$resdir/spa.stopwords"
ocrevalutf8 wordacc -S"$resdir/spa.stopwords" "$srcdir/$page.text" "$resdir/$page.text" > "$resdir/$page.wa"
fi
wafiles="$wafiles $resdir/$page.wa"
done <"$pages"
#echo "$accfiles"
#echo "$wafiles"
accsum $accfiles >"unlvtests/results/$setname.characc"
wordaccsum $wafiles >"unlvtests/results/$setname.wordacc"

View File

@ -1 +0,0 @@
1995 spn.3B 100 95.00% 0.00% 100 95.00% 0.00% 100 95.00% 0.00% WAS NOT TESTED

View File

@ -0,0 +1,2 @@
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWordErrors Accuracy TimeTaken
4_best_int_spa spn.3B 2846 99.18% 937 98.39% 739 97.54 6478.02s

View File

@ -0,0 +1,2 @@
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWordErrors Accuracy TimeTaken
4_best_spa spn.3B 2823 99.19% 924 98.41% 729 97.57 7233.76s

View File

@ -0,0 +1,2 @@
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWordErrors Accuracy TimeTaken
4_fast_spa spn.3B 2841 99.18% 879 98.49% 742 97.53 3838.82s

View File

@ -1,6 +1,6 @@
#!/bin/bash
# File: runalltests.sh
# Description: Script to run a set of UNLV test sets.
# Description: Script to run a set of UNLV test sets for English.
# Author: Ray Smith
# Created: Thu Jun 14 08:21:01 PDT 2007
#
@ -15,9 +15,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.
if [ $# -ne 4 ]
if [ $# -ne 3 ]
then
echo "Usage:$0 unlv-data-dir version-id tessdata-dir lang "
echo "Usage:$0 unlv-data-dir version-id tessdata-dir"
exit 1
fi
if [ ! -d src/api ]
@ -31,7 +31,6 @@ then
exit 1
fi
tessdata=$3
lang=$4
#deltapc new old calculates the %change from old to new
deltapc() {
@ -62,19 +61,8 @@ then
fi
rdir=unlvtests/reports
if [ "$lang" = "eng" ]
then
testsets="bus.3B doe3.3B mag.3B news.3B"
#testsets="bus.3B"
else
if [ "$lang" = "spa" ]
then
testsets="spn.3B"
else
echo "Language has to be eng or spa"
exit 1
fi
fi
testsets="bus.3B doe3.3B mag.3B news.3B"
#testsets="bus.3B"
totalerrs=0
totalwerrs=0
@ -87,7 +75,7 @@ do
if [ -r "$imdir/$set/pages" ]
then
# Run tesseract on all the pages.
$bindir/runtestset.sh "$imdir/$set/pages" "$tessdata" "$lang"
$bindir/runtestset.sh "$imdir/$set/pages" "$tessdata" "eng"
# Count the errors on all the pages.
$bindir/counttestset.sh "$imdir/$set/pages"
# Get the old character word and nonstop word errors.

109
unlvtests/runalltests_spa.sh Executable file
View File

@ -0,0 +1,109 @@
#!/bin/bash
##############################################################################
# File: runalltests_spa.sh
# Description: Script to run a set of UNLV test sets for Spanish.
# based on runalltests.sh by Ray Smith
# Author: Shree Devi Kumar
# Created: June 09, 2018
#
# (C) Copyright 2007, Google Inc.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
##############################################################################
# Usage: runalltests_spa.sh unlv-data-dir version-id tessdata-dir
# Exactly three arguments are required; the language is fixed to "spa".
if [ $# -ne 3 ]
then
  echo "Usage:$0 unlv-data-dir version-id tessdata-dir"
  exit 1
fi
# Must be run from the tesseract-ocr root so that relative paths
# (src/api, unlvtests/...) resolve correctly.
if [ ! -d src/api ]
then
  echo "Run $0 from the tesseract-ocr root directory!"
  exit 1
fi
if [ ! -r src/api/tesseract ] && [ ! -r tesseract.exe ]
then
  echo "Please build tesseract before running $0"
  exit 1
fi
tessdata=$3
# NOTE: the original assigned lang=$4 here, but the script accepts only
# 3 arguments (checked above), so $4 was always empty and unused; the
# language is hardcoded to "spa" in the calls below. Removed as dead code.

#timesum computes the total cpu time from a file of "page seconds" pairs
timesum() {
  awk ' BEGIN {
    total = 0.0;
  }
  {
    total += $2;
  }
  END {
    printf("%.2f\n", total);
  }' "$1"
}

imdir="$1"
vid="$2"
# Directory containing this script; fall back to "./" when invoked
# without a path component (${0%/*} leaves $0 unchanged in that case).
bindir=${0%/*}
if [ "$bindir" = "$0" ]
then
  bindir="./"
fi
rdir=unlvtests/reports

# Only the Spanish UNLV test set.
testsets="spn.3B"

totalerrs=0
totalwerrs=0
totalnswerrs=0
for set in $testsets
do
  if [ -r "$imdir/$set/pages" ]
  then
    # Run tesseract on all the pages.
    "$bindir"/runtestset.sh "$imdir/$set/pages" "$tessdata" "spa"
    # Count the errors on all the pages (Spanish uses a stopwords file).
    "$bindir"/counttestset.sh "$imdir/$set/pages" "spa"
    # Extract the new character, word and nonstop-word errors and accuracy
    # from the fixed-column accsum/wordaccsum report layout.
    cherrs=$(head -4 "unlvtests/results/$set.characc" |tail -1 |cut -c1-9 |
      tr -d '[:blank:]')
    chacc=$(head -5 "unlvtests/results/$set.characc" |tail -1 |cut -c1-9 |
      tr -d '[:blank:]')
    wderrs=$(head -4 "unlvtests/results/$set.wordacc" |tail -1 |cut -c1-9 |
      tr -d '[:blank:]')
    wdacc=$(head -5 "unlvtests/results/$set.wordacc" |tail -1 |cut -c1-9 |
      tr -d '[:blank:]')
    nswderrs=$(grep Total "unlvtests/results/$set.wordacc" |head -2 |tail -1 |
      cut -c10-17 |tr -d '[:blank:]')
    nswdacc=$(grep Total "unlvtests/results/$set.wordacc" |head -2 |tail -1 |
      cut -c19-26 |tr -d '[:blank:]')
    sumfile=$rdir/$vid.$set.sum
    # Total the CPU time, and produce a per-page time delta against the
    # previous run when one is available.
    if [ -r "unlvtests/results/$set.times" ]
    then
      total_time=$(timesum "unlvtests/results/$set.times")
      if [ -r "unlvtests/results/prev/$set.times" ]
      then
        paste "unlvtests/results/prev/$set.times" "unlvtests/results/$set.times" |
          awk '{ printf("%s %.2f\n", $1, $4-$2); }' |sort -k2n >"unlvtests/results/$set.timedelta"
      fi
    else
      total_time='0.0'
    fi
    # Write the one-line summary for this test set.
    echo "RELEASE TestSet CharErrors Accuracy WordErrors Accuracy\
 NonStopWordErrors Accuracy TimeTaken">"$sumfile"
    echo "$vid $set $cherrs $chacc $wderrs $wdacc\
 $nswderrs $nswdacc ${total_time}s" >>"$sumfile"
  fi
done
# Concatenate the per-set summaries into the overall report and show it.
cat "$rdir/$vid".*.sum >"$rdir/$vid".summary
mv "$rdir/$vid".*.sum unlvtests/results/
cat "$rdir/$vid".summary