From 6559af0c9d23dbe5dff3e4c1b73b272a14875053 Mon Sep 17 00:00:00 2001
From: Shree Devi Kumar <shreeshrii@gmail.com>
Date: Sat, 9 Jun 2018 12:47:09 +0000
Subject: [PATCH 1/5] update Spanish UNLV test, use spa.stopwords, iconv to
 UTF-8

---
 unlvtests/README.md               |  30 ++++++--
 unlvtests/counttestset.sh         |  22 ++++--
 unlvtests/reports/1995.spn.3B.sum |   1 -
 unlvtests/runalltests.sh          |  24 ++-----
 unlvtests/runalltests_spa.sh      | 109 ++++++++++++++++++++++++++++++
 5 files changed, 156 insertions(+), 30 deletions(-)
 delete mode 100644 unlvtests/reports/1995.spn.3B.sum
 create mode 100755 unlvtests/runalltests_spa.sh

diff --git a/unlvtests/README.md b/unlvtests/README.md
index 98ef8c258..4522ab5bc 100644
--- a/unlvtests/README.md
+++ b/unlvtests/README.md
@@ -34,11 +34,15 @@ tar xzvf ~/isri-downloads/doe3.3B.tar.gz
 tar xzvf ~/isri-downloads/mag.3B.tar.gz
 tar xzvf ~/isri-downloads/news.3B.tar.gz
 tar xzvf ~/isri-downloads/spn.3B.tar.gz
+mkdir -p stopwords
+cd stopwords
+wget -O spa.stopwords.txt https://raw.githubusercontent.com/stopwords-iso/stopwords-es/master/stopwords-es.txt
 ```
+Edit ~/ISRI-OCRtk/stopwords/spa.stopwords.txt
+wordacc uses a space delimited stopwords file, not line delimited.
 
 Edit *~/ISRI-OCRtk/spn.3B/pages*
 delete the line containing the following imagename as it crashes tesseract.
-
 7733_005.3B.tif
 
 ### Step 3: Download the modified ISRI toolkit, make and install the tools :
@@ -52,10 +56,10 @@ sudo make install
 
 ### Step 4: cd back to your main tesseract-ocr dir and Build tesseract.
 
-### Step 5: run unlvtests/runalltests.sh with the root ISRI data dir, testname, tessdata-dir and language:
+### Step 5: run unlvtests/runalltests.sh with the root ISRI data dir, testname, tessdata-dir:
 
 ```
-unlvtests/runalltests.sh ~/ISRI-OCRtk 4_fast_eng ../tessdata_fast eng
+unlvtests/runalltests.sh ~/ISRI-OCRtk 4_fast_eng ../tessdata_fast
 ```
 and go to the gym, have lunch etc. It takes a while to run.
 
@@ -66,5 +70,23 @@ report and comparison with the 1995 results.
 ### Step 7: run the test for Spanish.
 
 ```
-unlvtests/runalltests.sh ~/ISRI-OCRtk 4_fast_spa ../tessdata_fast spa
+unlvtests/runalltests_spa.sh ~/ISRI-OCRtk 4_fast_spa ../tessdata_fast
 ```
+
+#### Notes from Nick White regarding wordacc
+
+If you just want to remove all lines which have 100% recognition,
+you can add an 'awk' command like this:
+
+ocrevalutf8 wordacc ground.txt ocr.txt |
+awk '$3 != 100 {print $0}' > results.txt
+
+or if you've already got a results file you want to change, you can do this:
+
+awk '$3 != 100 {print $0}' results.txt > newresults.txt
+
+If you only want the last sections where things are broken down by
+word, you can add a sed command, like this:
+
+ocrevalutf8 wordacc ground.txt ocr.txt | sed '/^   Count   Missed %Right   $/,$!d' |
+awk '$3 != 100 {print $0}' > results.txt
diff --git a/unlvtests/counttestset.sh b/unlvtests/counttestset.sh
index 560c73f7c..be380b371 100755
--- a/unlvtests/counttestset.sh
+++ b/unlvtests/counttestset.sh
@@ -15,9 +15,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-if [ $# -ne 1 ]
+if [ $# -ne 2 ]
 then
-  echo "Usage:$0 pagesfile"
+  echo "Usage:$0 pagesfile langcode"
   exit 1
 fi
 if [ ! -d src/api ]
@@ -27,6 +27,7 @@ then
 fi
 
 pages=$1
+langcode=$2
 
 imdir=${pages%/pages}
 setname=${imdir##*/}
@@ -45,15 +46,22 @@ do
   fi
 #echo "$srcdir/$page.tif"
   # Count character errors.
-  ocrevalutf8  accuracy "$srcdir/$page.txt" "$resdir/$page.unlv" > "$resdir/$page.acc"
+  iconv -f  ISO8859-1 -t UTF-8 "$resdir/$page.unlv" >"$resdir/$page.text"
+  iconv -f  ISO8859-1 -t UTF-8 "$srcdir/$page.txt" >"$srcdir/$page.text"
+  ocrevalutf8  accuracy "$srcdir/$page.text" "$resdir/$page.text" > "$resdir/$page.acc"
   accfiles="$accfiles $resdir/$page.acc"
   # Count word errors.
-  ocrevalutf8  wordacc "$srcdir/$page.txt" "$resdir/$page.unlv" > "$resdir/$page.wa"
+  #langcode should be either eng or spa
+  if [ "$langcode" = "eng" ]
+    then
+      ocrevalutf8  wordacc "$srcdir/$page.text" "$resdir/$page.text" > "$resdir/$page.wa"
+    else
+      cp /home/ubuntu/ISRI-OCRtk/stopwords/spa.stopwords.txt "$resdir/spa.stopwords"
+      ocrevalutf8   wordacc -S"$resdir/spa.stopwords" "$srcdir/$page.text" "$resdir/$page.text" > "$resdir/$page.wa"
+  fi
   wafiles="$wafiles $resdir/$page.wa"
 done <"$pages"
 
-#echo "$accfiles"
-#echo "$wafiles"
-
 accsum $accfiles >"unlvtests/results/$setname.characc"
 wordaccsum $wafiles >"unlvtests/results/$setname.wordacc"
+
diff --git a/unlvtests/reports/1995.spn.3B.sum b/unlvtests/reports/1995.spn.3B.sum
deleted file mode 100644
index 35060967f..000000000
--- a/unlvtests/reports/1995.spn.3B.sum
+++ /dev/null
@@ -1 +0,0 @@
-1995	spn.3B	100	95.00%	0.00%	100	95.00%	0.00%	100	95.00%	0.00% WAS NOT TESTED
diff --git a/unlvtests/runalltests.sh b/unlvtests/runalltests.sh
index 18ef3929f..5cdf5e85f 100755
--- a/unlvtests/runalltests.sh
+++ b/unlvtests/runalltests.sh
@@ -1,6 +1,6 @@
 #!/bin/bash
 # File:        runalltests.sh
-# Description: Script to run a set of UNLV test sets.
+# Description: Script to run a set of UNLV test sets for English.
 # Author:      Ray Smith
 # Created:     Thu Jun 14 08:21:01 PDT 2007
 #
@@ -15,9 +15,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-if [ $# -ne 4 ]
+if [ $# -ne 3 ]
 then
-   echo "Usage:$0 unlv-data-dir version-id tessdata-dir lang "
+   echo "Usage:$0 unlv-data-dir version-id tessdata-dir"
    exit 1
 fi
 if [ ! -d src/api ]
@@ -31,7 +31,6 @@ then
   exit 1
 fi
 tessdata=$3
-lang=$4
 
 #deltapc new old calculates the %change from old to new
 deltapc() {
@@ -62,19 +61,8 @@ then
 fi
 rdir=unlvtests/reports
 
-if [ "$lang" = "eng" ]
-then
-    testsets="bus.3B doe3.3B mag.3B news.3B"
-    #testsets="bus.3B"
-else
-    if [ "$lang" = "spa" ]
-    then
-        testsets="spn.3B"
-    else
-        echo "Language has to be eng or spa"
-        exit 1
-    fi
-fi
+testsets="bus.3B doe3.3B mag.3B news.3B"
+#testsets="bus.3B"
 
 totalerrs=0
 totalwerrs=0
@@ -87,7 +75,7 @@ do
     if [ -r "$imdir/$set/pages" ]
     then
 	# Run tesseract on all the pages.
-	$bindir/runtestset.sh "$imdir/$set/pages" "$tessdata" "$lang"
+	$bindir/runtestset.sh "$imdir/$set/pages" "$tessdata" "eng"
 	# Count the errors on all the pages.
 	$bindir/counttestset.sh "$imdir/$set/pages"
 	# Get the old character word and nonstop word errors.
diff --git a/unlvtests/runalltests_spa.sh b/unlvtests/runalltests_spa.sh
new file mode 100755
index 000000000..a6e218bbc
--- /dev/null
+++ b/unlvtests/runalltests_spa.sh
@@ -0,0 +1,109 @@
+#!/bin/bash
+##############################################################################
+# File:        runalltests_spa.sh
+# Description: Script to run a set of UNLV test sets for Spanish.
+#                      based on runalltests.sh by Ray Smith
+# Author:      Shree Devi Kumar
+# Created:     June 09, 2018
+#
+# (C) Copyright 2007, Google Inc.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+##############################################################################
+if [ $# -ne 3 ]
+then
+   echo "Usage:$0 unlv-data-dir version-id tessdata-dir"
+   exit 1
+fi
+if [ ! -d src/api ]
+then
+  echo "Run $0 from the tesseract-ocr root directory!"
+  exit 1
+fi
+if [ ! -r src/api/tesseract ] && [ ! -r tesseract.exe ]
+then
+  echo "Please build tesseract before running $0"
+  exit 1
+fi
+tessdata=$3
+lang="spa" # Spanish-only script: exactly 3 arguments are accepted, so there is no $4 to read.
+
+# timesum FILE: print the sum of column 2 of FILE, formatted with two decimals.
+timesum() {
+awk ' BEGIN {
+total = 0.0;
+}
+{
+  total += $2;
+}
+END {
+  printf("%.2f\n", total);
+}' "$1"
+}
+
+imdir="$1"
+vid="$2"
+bindir=${0%/*}
+if [ "$bindir" = "$0" ]
+then
+    bindir="./"
+fi
+rdir=unlvtests/reports
+
+testsets="spn.3B"
+
+# Each readable test set yields one "$vid.$set.sum" report in $rdir; after the
+# loop the reports are concatenated into "$vid.summary" and the per-set files
+# are moved to unlvtests/results/.
+for set in $testsets
+do
+    if [ -r "$imdir/$set/pages" ]
+    then
+	# Run tesseract on all the pages.
+	"$bindir/runtestset.sh" "$imdir/$set/pages" "$tessdata" "$lang"
+	# Count the errors on all the pages.
+	"$bindir/counttestset.sh" "$imdir/$set/pages" "$lang"
+	# Pull the error counts and accuracies out of fixed line/column positions of the reports.
+	cherrs=$(head -4 "unlvtests/results/$set.characc" |tail -1 |cut -c1-9 |
+	    tr -d '[:blank:]')
+	chacc=$(head -5 "unlvtests/results/$set.characc" |tail -1 |cut -c1-9 |
+	    tr -d '[:blank:]')
+	wderrs=$(head -4 "unlvtests/results/$set.wordacc" |tail -1 |cut -c1-9 |
+	    tr -d '[:blank:]')
+	wdacc=$(head -5 "unlvtests/results/$set.wordacc" |tail -1 |cut -c1-9 |
+	    tr -d '[:blank:]')
+	nswderrs=$(grep Total "unlvtests/results/$set.wordacc" |head -2 |tail -1 |
+	    cut -c10-17 |tr -d '[:blank:]')
+	nswdacc=$(grep Total "unlvtests/results/$set.wordacc" |head -2 |tail -1 |
+	    cut -c19-26 |tr -d '[:blank:]')
+	# Write the per-set summary; the time column is 0.0 when no .times file is readable.
+        sumfile="$rdir/$vid.$set.sum"
+        if [ -r "unlvtests/results/$set.times" ]
+        then
+          total_time=$(timesum "unlvtests/results/$set.times")
+          if [ -r "unlvtests/results/prev/$set.times" ]
+          then
+            paste "unlvtests/results/prev/$set.times" "unlvtests/results/$set.times" |
+              awk '{ printf("%s %.2f\n", $1, $4-$2); }' |sort -k2n >"unlvtests/results/$set.timedelta"
+          fi
+        else
+          total_time='0.0'
+        fi
+        echo "RELEASE		TestSet	CharErrors	Accuracy	WordErrors	Accuracy\
+	NonStopWordErrors	Accuracy	TimeTaken">"$sumfile"
+        echo "$vid	$set	$cherrs		$chacc		$wderrs		$wdacc\
+		$nswderrs			$nswdacc		${total_time}s" >>"$sumfile"
+    fi
+done
+
+cat "$rdir/$vid".*.sum >"$rdir/$vid".summary
+
+mv "$rdir/$vid".*.sum unlvtests/results/
+cat "$rdir/$vid".summary

From 86700fd345c1164aefa2761286358c2d7c6e503f Mon Sep 17 00:00:00 2001
From: Shree Devi Kumar <shreeshrii@gmail.com>
Date: Sat, 9 Jun 2018 13:07:21 +0000
Subject: [PATCH 2/5] add summary for Spanish UNLV test with 4.0.0-beta with
 --tessdata_fast

---
 unlvtests/Makefile.am                | 1 +
 unlvtests/reports/4_fast_spa.summary | 2 ++
 2 files changed, 3 insertions(+)
 create mode 100644 unlvtests/reports/4_fast_spa.summary

diff --git a/unlvtests/Makefile.am b/unlvtests/Makefile.am
index bf5731df8..8fc2eb764 100644
--- a/unlvtests/Makefile.am
+++ b/unlvtests/Makefile.am
@@ -10,3 +10,4 @@ EXTRA_DIST += reports/1995.mag.3B.sum
 EXTRA_DIST += reports/1995.news.3B.sum
 EXTRA_DIST += reports/2.03.summary
 EXTRA_DIST += reports/2.04.summary
+EXTRA_DIST += reports/4_fast_spa.summary
diff --git a/unlvtests/reports/4_fast_spa.summary b/unlvtests/reports/4_fast_spa.summary
new file mode 100644
index 000000000..6d25fe333
--- /dev/null
+++ b/unlvtests/reports/4_fast_spa.summary
@@ -0,0 +1,2 @@
+RELEASE		TestSet	CharErrors	Accuracy	WordErrors	Accuracy	NonStopWordErrors	Accuracy	TimeTaken
+4_fast_spa	spn.3B	2841		99.18%		879		98.49%		742			97.53		3838.82s

From 4290951fc19fc9c3ccc120f696b156780bed351a Mon Sep 17 00:00:00 2001
From: Shree Devi Kumar <shreeshrii@gmail.com>
Date: Sat, 9 Jun 2018 14:36:10 +0000
Subject: [PATCH 3/5] add summary for Spanish UNLV test with 4.0.0-beta with
 --tessdata_best and --tessdata

---
 unlvtests/Makefile.am                    | 2 ++
 unlvtests/reports/4_best_int_spa.summary | 2 ++
 unlvtests/reports/4_best_spa.summary     | 2 ++
 3 files changed, 6 insertions(+)
 create mode 100644 unlvtests/reports/4_best_int_spa.summary
 create mode 100644 unlvtests/reports/4_best_spa.summary

diff --git a/unlvtests/Makefile.am b/unlvtests/Makefile.am
index 8fc2eb764..23790c713 100644
--- a/unlvtests/Makefile.am
+++ b/unlvtests/Makefile.am
@@ -10,4 +10,6 @@ EXTRA_DIST += reports/1995.mag.3B.sum
 EXTRA_DIST += reports/1995.news.3B.sum
 EXTRA_DIST += reports/2.03.summary
 EXTRA_DIST += reports/2.04.summary
+EXTRA_DIST += reports/4_best_spa.summary
+EXTRA_DIST += reports/4_best_int_spa.summary
 EXTRA_DIST += reports/4_fast_spa.summary
diff --git a/unlvtests/reports/4_best_int_spa.summary b/unlvtests/reports/4_best_int_spa.summary
new file mode 100644
index 000000000..cbb92073a
--- /dev/null
+++ b/unlvtests/reports/4_best_int_spa.summary
@@ -0,0 +1,2 @@
+RELEASE		TestSet	CharErrors	Accuracy	WordErrors	Accuracy	NonStopWordErrors	Accuracy	TimeTaken
+4_best_int_spa	spn.3B	2846		99.18%		937		98.39%		739			97.54		6478.02s
diff --git a/unlvtests/reports/4_best_spa.summary b/unlvtests/reports/4_best_spa.summary
new file mode 100644
index 000000000..69a7b75d8
--- /dev/null
+++ b/unlvtests/reports/4_best_spa.summary
@@ -0,0 +1,2 @@
+RELEASE		TestSet	CharErrors	Accuracy	WordErrors	Accuracy	NonStopWordErrors	Accuracy	TimeTaken
+4_best_spa	spn.3B	2823		99.19%		924		98.41%		729			97.57		7233.76s

From a01d1604c301cac64f0e243c413846abf3553f77 Mon Sep 17 00:00:00 2001
From: Shree Devi Kumar <shreeshrii@gmail.com>
Date: Sat, 9 Jun 2018 14:44:54 +0000
Subject: [PATCH 4/5] update readme

---
 unlvtests/README.md | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/unlvtests/README.md b/unlvtests/README.md
index 4522ab5bc..d98df97fc 100644
--- a/unlvtests/README.md
+++ b/unlvtests/README.md
@@ -40,10 +40,12 @@ wget -O spa.stopwords.txt https://raw.githubusercontent.com/stopwords-iso/stopwo
 ```
 Edit ~/ISRI-OCRtk/stopwords/spa.stopwords.txt
 wordacc uses a space delimited stopwords file, not line delimited.
+Replace every newline with a space, e.g. with the regex substitution `s/\n/ /g`.
 
-Edit *~/ISRI-OCRtk/spn.3B/pages*
-delete the line containing the following imagename as it crashes tesseract.
-7733_005.3B.tif
+Edit ~/ISRI-OCRtk/spn.3B/pages
+Delete the line containing the following imagename as it [crashes tesseract](https://github.com/tesseract-ocr/tesseract/issues/1647#issuecomment-395954717).
+
+7733_005.3B 3
 
 ### Step 3: Download the modified ISRI toolkit, make and install the tools :
 These will be installed in /usr/local/bin.

From d8bed41ec3a55788e4044d26dfa62c8c57086627 Mon Sep 17 00:00:00 2001
From: Shreeshrii <shreeshrii@gmail.com>
Date: Sat, 9 Jun 2018 20:17:51 +0530
Subject: [PATCH 5/5] change filename to generic ~/

---
 unlvtests/counttestset.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/unlvtests/counttestset.sh b/unlvtests/counttestset.sh
index be380b371..7e3d7b5f2 100755
--- a/unlvtests/counttestset.sh
+++ b/unlvtests/counttestset.sh
@@ -56,7 +56,8 @@ do
     then
       ocrevalutf8  wordacc "$srcdir/$page.text" "$resdir/$page.text" > "$resdir/$page.wa"
     else
-      cp /home/ubuntu/ISRI-OCRtk/stopwords/spa.stopwords.txt "$resdir/spa.stopwords"
+      # Locate the stopwords file under the UNLV data root (the parent of the test-set dir in $imdir).
+      cp "${imdir%/*}/stopwords/spa.stopwords.txt" "$resdir/spa.stopwords"
       ocrevalutf8   wordacc -S"$resdir/spa.stopwords" "$srcdir/$page.text" "$resdir/$page.text" > "$resdir/$page.wa"
   fi
   wafiles="$wafiles $resdir/$page.wa"