move langtests and unlvtests from tesseract-ocr repository to test repository

This commit is contained in:
zdenop 2018-11-08 22:31:32 +01:00
parent cbef2ebe12
commit cdfb768010
43 changed files with 1 additions and 1034 deletions

View File

@ -24,7 +24,7 @@ SUBDIRS += src/ccmain src/api . tessdata doc unittest
EXTRA_DIST = README.md LICENSE
EXTRA_DIST += aclocal.m4 config configure.ac autogen.sh
EXTRA_DIST += tesseract.pc.in $(TRAINING_SUBDIR) java doc langtests unlvtests
EXTRA_DIST += tesseract.pc.in $(TRAINING_SUBDIR) java doc
EXTRA_DIST += CMakeLists.txt tesseract.pc.cmake cmake VERSION src/vs2010 cppan.yml
DIST_SUBDIRS = $(SUBDIRS) $(TRAINING_SUBDIR)

View File

@ -463,7 +463,6 @@ fi
# Output files
AC_CONFIG_FILES([Makefile tesseract.pc])
AC_CONFIG_FILES([langtests/Makefile])
AC_CONFIG_FILES([src/api/Makefile])
AC_CONFIG_FILES([src/api/tess_version.h])
AC_CONFIG_FILES([src/arch/Makefile])
@ -481,7 +480,6 @@ AC_CONFIG_FILES([src/wordrec/Makefile])
AC_CONFIG_FILES([tessdata/Makefile])
AC_CONFIG_FILES([tessdata/configs/Makefile])
AC_CONFIG_FILES([tessdata/tessconfigs/Makefile])
AC_CONFIG_FILES([unlvtests/Makefile])
AC_CONFIG_FILES([unittest/Makefile])
AC_CONFIG_FILES([java/Makefile])
AC_CONFIG_FILES([java/com/Makefile])

View File

@ -1,2 +0,0 @@
#
results/*

View File

@ -1,8 +0,0 @@
EXTRA_DIST = README.md
EXTRA_DIST += frk_setup.sh
EXTRA_DIST += frk_test.sh
EXTRA_DIST += counttestset.sh
EXTRA_DIST += runlangtests.sh
EXTRA_DIST += runtestset.sh
EXTRA_DIST += reports/*

View File

@ -1,54 +0,0 @@
# Language tests.
The scripts in this directory make it possible to test Accuracy of Tesseract for different languages.
## Setup
### Step 1: If not already installed, download the modified ISRI toolkit,
make and install the tools in /usr/local/bin.
```
git clone https://github.com/Shreeshrii/ocr-evaluation-tools.git
cd ~/ocr-evaluation-tools
sudo make install
```
### Step 2: If not already built, build Tesseract.
Use binaries from the tesseract/src/api and tesseract/src/training directory.
### Step 3
Download images and corresponding ground truth text for the language to be tested.
Each testset should have only one kind of images (eg. tif, png, jpg etc).
The ground truth text files should have the same base filename with txt extension.
As needed, modify the filenames and create the `pages` file for each testset.
Instructions for testing Fraktur and Sanskrit languages are given below as an example.
## Testing for Fraktur - frk and script/Fraktur
### Download the images and groundtruth, modify to required format.
```
bash -x frk_setup.sh
```
### Run tests for Fraktur - frk and script/Fraktur
```
bash -x frk_test.sh
```
## Testing for Sanskrit - san and script/Devanagari
### Download the images and groundtruth, modify to required format.
```
bash -x deva_setup.sh
```
### Run tests
```
bash -x deva_test.sh
```
### Notes from Nick White regarding wordacc
If you just want to remove all lines which have 100% recognition,
you can add a 'awk' command like this:
ocrevalutf8 wordacc ground.txt ocr.txt | awk '$3 != 100 {print $0}' > results.txt
or if you've already got a results file you want to change, you can do this:
awk '$3 != 100 {print $0}' results.txt > newresults.txt
If you only want the last sections where things are broken down by
word, you can add a sed command, like this:
ocrevalutf8 wordacc ground.txt ocr.txt | sed '/^ Count Missed %Right $/,$!d' | awk '$3 != 100 {print $0}' > results.txt

View File

@ -1,52 +0,0 @@
#!/bin/bash
# File: counttestset.sh
# Description: Script to count the errors on a single UNLV set.
# Author: Ray Smith
# Created: Wed Jun 13 11:58:01 PDT 2007
#
# (C) Copyright 2007, Google Inc.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
if [ $# -ne 2 ]
then
  echo "Usage:$0 pagesfile langcode"
  exit 1
fi
pages=$1
langcode=$2
# The pages file lives inside the image directory; derive the set name
# from the directory's basename.
imdir=${pages%/pages}
setname=${imdir##*/}
resdir=langtests/results/$setname
mkdir -p langtests/reports
echo "Counting on set $setname in directory $imdir to $resdir"
# Collect the per-page result files in arrays so filenames survive intact
# (the old string-concatenation approach depended on unquoted expansion).
accfiles=()
wafiles=()
while read -r page dir
do
  # A pages file may list "page subdir" pairs or just bare page names.
  if [ "$dir" ]
  then
    srcdir="$imdir/$dir"
  else
    srcdir="$imdir"
  fi
  echo "$srcdir/$page"
  # Count character errors.
  ocrevalutf8 accuracy "$srcdir/$page.txt" "$resdir/$page.txt" > "$resdir/$page.acc"
  accfiles+=("$resdir/$page.acc")
  # Count word errors, ignoring the language's stopwords.
  ocrevalutf8 wordacc -S"$resdir/$langcode.stopwords" "$srcdir/$page.txt" "$resdir/$page.txt" > "$resdir/$page.wa"
  wafiles+=("$resdir/$page.wa")
done <"$pages"
# Summarize character and word accuracy over the whole set.
accsum "${accfiles[@]}" >"langtests/results/$setname.characc"
wordaccsum "${wafiles[@]}" >"langtests/results/$setname.wordacc"

View File

@ -1,18 +0,0 @@
#!/bin/bash
#
# Prepare the Sanskrit (san) test sets for the language accuracy tests:
# copy images and ground truth from ~/lang-deva-downloads into
# ~/lang-files/san-*, build each set's "pages" file, and install the
# Devanagari stopwords list into ~/lang-stopwords.
mkdir -p ~/lang-files
rm -rf ~/lang-files/san-*
for testset in vedic fontsamples oldstyle shreelipi alphabetsamples
do
  cd ~/lang-files || exit 1
  mkdir -p "./san-$testset"
  cp ~/lang-deva-downloads/imagessan/"$testset"/*.* "./san-$testset/"
  cd "./san-$testset/" || exit 1
  # Ground truth arrives as *-gt.txt; the test scripts expect plain .txt.
  rename s/-gt.txt/.txt/ *.txt
  ls -1 *.png >pages
  # Strip the literal ".png" extension: escape the dot and anchor at the
  # end of the line (the old 's/.png//g' matched any char followed by
  # "png" anywhere in the name).
  sed -i -e 's/\.png$//' pages
done
mkdir -p ~/lang-stopwords
cd ~/lang-stopwords || exit 1
cp ~/lang-deva-downloads/imagessan/stopwords.txt ./san.stopwords.txt

View File

@ -1,18 +0,0 @@
#!/bin/bash
# run langtests/runlangtests.sh with the root data dir, testname, tessdata-dir, language code and image extension
# Runs the Sanskrit accuracy tests against the fast, best-integer and best
# float models for both the Devanagari script model and the san language
# model, plus a locally fine-tuned model (tesstutorial-deva).
cd ~/tesseract
# Script-level (Devanagari) models: fast, best quantized to int, best float.
langtests/runlangtests.sh ~/lang-files 4_fast_Devanagari ../tessdata_fast/script Devanagari png
langtests/runlangtests.sh ~/lang-files 4_best_int_Devanagari ../tessdata/script Devanagari png
langtests/runlangtests.sh ~/lang-files 4_best_Devanagari ../tessdata_best/script Devanagari png
# Language (san) models: fast, best quantized to int, best float.
langtests/runlangtests.sh ~/lang-files 4_fast_san ../tessdata_fast san png
langtests/runlangtests.sh ~/lang-files 4_best_int_san ../tessdata san png
langtests/runlangtests.sh ~/lang-files 4_best_san ../tessdata_best san png
# Fine-tuned model after 40k further training iterations.
langtests/runlangtests.sh ~/lang-files 4_plus40k_san ../tesstutorial-deva san png
#/home/ubuntu/tesstutorial-deva/san.traineddata at n iterations
### It takes a while to run.

View File

@ -1,24 +0,0 @@
#!/bin/bash
#
# Download the Fraktur (frk) test images and ground truth, arrange them in
# ~/lang-files/frk-ligatures with a "pages" file, and install a German
# stopwords list into ~/lang-stopwords.
mkdir -p ~/lang-downloads
cd ~/lang-downloads || exit 1
wget -O frk-jbarth-ubhd.zip http://digi.ub.uni-heidelberg.de/diglitData/v/abbyy11r8-vs-tesseract4.zip
wget -O frk-stweil-gt.zip https://digi.bib.uni-mannheim.de/~stweil/fraktur-gt.zip
mkdir -p ~/lang-files
cd ~/lang-files || exit 1
unzip ~/lang-downloads/frk-jbarth-ubhd.zip -d frk
unzip ~/lang-downloads/frk-stweil-gt.zip -d frk
mkdir -p ./frk-ligatures
cp ./frk/abbyy-vs-tesseract/*.tif ./frk-ligatures/
cp ./frk/gt/*.txt ./frk-ligatures/
cd ./frk-ligatures/ || exit 1
ls -1 *.tif >pages
# Strip the literal ".tif" extension: escape the dot and anchor at the end
# of the line (the old 's/.tif//g' matched any char followed by "tif").
sed -i -e 's/\.tif$//' pages
mkdir -p ~/lang-stopwords
cd ~/lang-stopwords || exit 1
wget -O frk.stopwords.txt https://raw.githubusercontent.com/stopwords-iso/stopwords-de/master/stopwords-de.txt
# Fixed path in the message: the file is saved in ~/lang-stopwords/, not
# ~/lang-files/stopwords/.
echo "Edit ~/lang-stopwords/frk.stopwords.txt as wordacc uses a space delimited stopwords file, not line delimited."

View File

@ -1,13 +0,0 @@
#!/bin/bash
#
# run langtests/runlangtests.sh with the root ISRI data dir, testname, tessdata-dir, language code:
# Runs the Fraktur accuracy tests: the fast script (Fraktur) model, then
# the fast, best-integer and best float frk language models.
cd ~/tesseract
langtests/runlangtests.sh ~/lang-files 4_fast_Fraktur ../tessdata_fast/script Fraktur tif
langtests/runlangtests.sh ~/lang-files 4_fast_frk ../tessdata_fast frk tif
langtests/runlangtests.sh ~/lang-files 4_best_int_frk ../tessdata frk tif
langtests/runlangtests.sh ~/lang-files 4_best_frk ../tessdata_best frk tif
### It takes a while to run.

View File

@ -1,8 +0,0 @@
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken
4_best_Devanagari san-alphabetsamples 2013 56.17% 1323 12.27% 1323 12.27 606.28s
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken
4_best_Devanagari san-fontsamples 388 94.82% 87 86.38% 87 86.38 570.17s
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken
4_best_Devanagari san-oldstyle 2796 59.93% 523 39.61% 523 39.61 447.73s
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken
4_best_Devanagari san-shreelipi 830 94.01% 311 81.40% 311 81.40 1137.51s

View File

@ -1,2 +0,0 @@
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken
4_best_frk frk-ligatures 178 94.73% 100 81.31% 74 75.17 94.29s

View File

@ -1,8 +0,0 @@
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken
4_best_int_Devanagari san-alphabetsamples 2010 56.24% 1321 12.40% 1321 12.40 556.26s
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken
4_best_int_Devanagari san-fontsamples 396 94.72% 89 86.07% 89 86.07 524.07s
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken
4_best_int_Devanagari san-oldstyle 2812 59.70% 523 39.61% 523 39.61 416.57s
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken
4_best_int_Devanagari san-shreelipi 829 94.01% 314 81.22% 314 81.22 1087.02s

View File

@ -1,2 +0,0 @@
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken
4_best_int_frk frk-ligatures 244 92.78% 109 79.63% 80 73.15 367.73s

View File

@ -1,8 +0,0 @@
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken
4_best_int_san san-alphabetsamples 2342 49.01% 1353 10.28% 1353 10.28 281.60s
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken
4_best_int_san san-fontsamples 474 93.68% 126 80.28% 126 80.28 281.05s
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken
4_best_int_san san-oldstyle 3121 55.27% 602 30.48% 602 30.48 206.20s
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken
4_best_int_san san-shreelipi 1163 91.60% 417 75.06% 417 75.06 606.80s

View File

@ -1,8 +0,0 @@
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken
4_best_san san-alphabetsamples 2335 49.16% 1348 10.61% 1348 10.61 300.24s
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken
4_best_san san-fontsamples 473 93.69% 126 80.28% 126 80.28 267.05s
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken
4_best_san san-oldstyle 3121 55.27% 598 30.95% 598 30.95 205.28s
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken
4_best_san san-shreelipi 1168 91.56% 414 75.24% 414 75.24 610.52s

View File

@ -1,8 +0,0 @@
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken
4_fast_Devanagari san-alphabetsamples 2017 56.09% 1317 12.67% 1317 12.67 400.38s
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken
4_fast_Devanagari san-fontsamples 433 94.22% 108 83.10% 108 83.10 287.48s
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken
4_fast_Devanagari san-oldstyle 2883 58.68% 543 37.30% 543 37.30 289.85s
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken
4_fast_Devanagari san-shreelipi 750 94.58% 279 83.31% 279 83.31 813.19s

View File

@ -1,2 +0,0 @@
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken
4_fast_Fraktur frk-ligatures 265 92.16% 116 78.32% 82 72.48 91.29s

View File

@ -1,2 +0,0 @@
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken
4_fast_frk frk-ligatures 244 92.78% 109 79.63% 80 73.15 89.98s

View File

@ -1,8 +0,0 @@
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken
4_fast_san san-alphabetsamples 2342 49.01% 1353 10.28% 1353 10.28 276.73s
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken
4_fast_san san-fontsamples 474 93.68% 126 80.28% 126 80.28 278.34s
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken
4_fast_san san-oldstyle 3121 55.27% 602 30.48% 602 30.48 222.35s
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken
4_fast_san san-shreelipi 1163 91.60% 417 75.06% 417 75.06 626.40s

View File

@ -1,8 +0,0 @@
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken
4_plus10k_san san-alphabetsamples 1725 62.44% 1112 26.26% 1112 26.26 160.48s
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken
4_plus10k_san san-fontsamples 349 95.34% 73 88.58% 73 88.58 138.09s
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken
4_plus10k_san san-oldstyle 2818 59.62% 548 36.72% 548 36.72 120.83s
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken
4_plus10k_san san-shreelipi 746 94.61% 279 83.31% 279 83.31 292.70s

View File

@ -1,8 +0,0 @@
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken
4_plus20k_san san-alphabetsamples 1441 68.63% 841 44.23% 841 44.23 156.57s
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken
4_plus20k_san san-fontsamples 356 95.25% 75 88.26% 75 88.26 135.13s
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken
4_plus20k_san san-oldstyle 2862 58.99% 555 35.91% 555 35.91 118.21s
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken
4_plus20k_san san-shreelipi 726 94.76% 267 84.03% 267 84.03 295.68s

View File

@ -1,8 +0,0 @@
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken
4_plus30k_san san-alphabetsamples 1656 63.95% 937 37.86% 937 37.86 615.62s
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken
4_plus30k_san san-fontsamples 429 94.28% 89 86.07% 89 86.07 617.42s
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken
4_plus30k_san san-oldstyle 2885 58.66% 561 35.22% 561 35.22 432.58s
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken
4_plus30k_san san-shreelipi 447 96.77% 123 92.64% 123 92.64 1081.29s

View File

@ -1,8 +0,0 @@
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken
4_plus40k_san san-alphabetsamples 1380 69.95% 775 48.61% 775 48.61 1198.16s
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken
4_plus40k_san san-fontsamples 401 94.65% 79 87.64% 79 87.64 1275.08s
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken
4_plus40k_san san-oldstyle 2860 59.01% 534 38.34% 534 38.34 977.65s
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken
4_plus40k_san san-shreelipi 441 96.81% 113 93.24% 113 93.24 2301.53s

View File

@ -1,105 +0,0 @@
#!/bin/bash
##############################################################################
# File: runlangtests.sh
# Description: Script to run a set of accuracy test sets for any language.
# based on runalltests.sh by Ray Smith
# Author: Shree Devi Kumar
# Created: June 09, 2018
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
##############################################################################
if [ $# -ne 5 ]
then
  echo "Usage:$0 unlv-data-dir version-id tessdata-dir langcode imgext"
  exit 1
fi
tessdata=$3
lang=$4
imgext=$5
# timesum: sum the per-page cpu times (second column of the .times file).
timesum() {
  awk ' BEGIN {
    total = 0.0;
  }
  {
    total += $2;
  }
  END {
    printf("%.2f\n", total);
  }' "$1"
}
imdir="$1"
vid="$2"
bindir=${0%/*}
if [ "$bindir" = "$0" ]
then
  bindir="./"
fi
rdir=langtests/reports
# Map the requested language/script to its test sets and to the language
# code used for the stopwords file produced by the *_setup.sh scripts.
testsets=""
if [ "$lang" = "frk" ] || [ "$lang" = "Fraktur" ]
then
  testsets="frk-ligatures"
  stoplang=frk
fi
if [ "$lang" = "san" ] || [ "$lang" = "Devanagari" ]
then
  testsets="san-fontsamples san-oldstyle san-shreelipi san-alphabetsamples"
  stoplang=san
fi
if [ -z "$testsets" ]
then
  echo "Unsupported language code: $lang"
  exit 1
fi
for set in $testsets
do
  resdir=langtests/results/$set
  mkdir -p "$resdir"
  # Bug fix: copy the stopwords for the language under test; the old code
  # always copied the Fraktur (frk) stopwords, even for Sanskrit runs.
  cp ~/lang-stopwords/"$stoplang".stopwords.txt "$resdir/$lang.stopwords"
  if [ -r "$imdir/$set/pages" ]
  then
    # Run tesseract on all the pages.
    "$bindir"/runtestset.sh "$imdir/$set/pages" "$tessdata" "$lang" "$imgext"
    # Count the errors on all the pages.
    "$bindir"/counttestset.sh "$imdir/$set/pages" "$lang"
    # Get the new character word and nonstop word errors and accuracy.
    cherrs=$(head -4 "langtests/results/$set.characc" |tail -1 |cut -c1-9 |
      tr -d '[:blank:]')
    chacc=$(head -5 "langtests/results/$set.characc" |tail -1 |cut -c1-9 |
      tr -d '[:blank:]')
    wderrs=$(head -4 "langtests/results/$set.wordacc" |tail -1 |cut -c1-9 |
      tr -d '[:blank:]')
    wdacc=$(head -5 "langtests/results/$set.wordacc" |tail -1 |cut -c1-9 |
      tr -d '[:blank:]')
    nswderrs=$(grep Total "langtests/results/$set.wordacc" |head -2 |tail -1 |
      cut -c10-17 |tr -d '[:blank:]')
    nswdacc=$(grep Total "langtests/results/$set.wordacc" |head -2 |tail -1 |
      cut -c19-26 |tr -d '[:blank:]')
    sumfile=$rdir/$vid.$set.sum
    if [ -r "langtests/results/$set.times" ]
    then
      total_time=$(timesum "langtests/results/$set.times")
    else
      total_time='0.0'
    fi
    # Write a one-set summary: header line, then the measured numbers.
    echo "RELEASE TestSet CharErrors Accuracy WordErrors Accuracy\
 NonStopWErrors Accuracy TimeTaken">"$sumfile"
    echo "$vid $set $cherrs $chacc $wderrs $wdacc\
 $nswderrs $nswdacc ${total_time}s" >>"$sumfile"
  fi
done
# Concatenate the per-set summaries into one report and display it.
cat "$rdir/$vid".*.sum >"$rdir/$vid".summary
mv "$rdir/$vid".*.sum langtests/results/
cat "$rdir/$vid".summary

View File

@ -1,61 +0,0 @@
#!/bin/bash
# File: runtestset.sh
# Description: Script to run tesseract on a single UNLV set.
# Author: Ray Smith
# Created: Wed Jun 13 10:13:01 PDT 2007
#
# (C) Copyright 2007, Google Inc.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
if [ $# -ne 4 ]
then
  echo "Usage:$0 pagesfile tessdata-dir langcode imgext"
  exit 1
fi
# Wrap tesseract in GNU time: %U = user cpu seconds, written to times.txt.
# $tess is deliberately expanded unquoted below so it splits into words.
tess="time -f %U -o times.txt ./src/api/tesseract"
tessdata=$2
langcode=$3
imgext=$4
pages=$1
imdir=${pages%/pages}
setname=${imdir##*/}
config=""
resdir=langtests/results/$setname
echo -e "Testing on set $setname in directory $imdir to $resdir\n"
mkdir -p "$resdir"
rm -f "langtests/results/$setname.times"
while read -r page dir
do
  # A pages file may be a list of files with subdirs or maybe just
  # a plain list of files so accommodate both.
  if [ "$dir" ]
  then
    srcdir="$imdir/$dir"
  else
    srcdir="$imdir"
  fi
  echo "$srcdir/$page"
  # Filter tesseract's routine banner lines out of the output.
  $tess "$srcdir/$page.$imgext" "$resdir/$page" --tessdata-dir "$tessdata" --oem 1 -l "$langcode" --psm 6 $config 2>&1 |grep -v "OCR Engine" |grep -v "Page 1"
  if [ -r times.txt ]
  then
    read -r t <times.txt
    echo "$page $t" >>"langtests/results/$setname.times"
    # \033M reverse-scrolls one line so progress overwrites in place.
    echo -e "\033M$page $t"
    # GNU time writes this message when tesseract was interrupted (Ctrl-C);
    # stop the whole run in that case.
    if [ "$t" = "Command terminated by signal 2" ]
    then
      exit 0
    fi
  fi
done <"$pages"

View File

@ -1,12 +0,0 @@
EXTRA_DIST = README.md
EXTRA_DIST += counttestset.sh
EXTRA_DIST += runalltests.sh
EXTRA_DIST += runalltests_spa.sh
EXTRA_DIST += runtestset.sh
EXTRA_DIST += reports/1995.bus.3B.sum
EXTRA_DIST += reports/1995.doe3.3B.sum
EXTRA_DIST += reports/1995.mag.3B.sum
EXTRA_DIST += reports/1995.news.3B.sum
EXTRA_DIST += reports/2.03.summary
EXTRA_DIST += reports/2.04.summary

View File

@ -1,94 +0,0 @@
## How to run UNLV tests.
The scripts in this directory make it possible to duplicate the tests
published in the Fourth Annual Test of OCR Accuracy.
See http://www.expervision.com/wp-content/uploads/2012/12/1995.The_Fourth_Annual_Test_of_OCR_Accuracy.pdf
but first you have to get the tools and data used by UNLV:
### Step 1: to download the images go to
https://sourceforge.net/projects/isri-ocr-evaluation-tools-alt/files/
and get doe3.3B.tar.gz, bus.3B.tar.gz, mag.3B.tar.gz and news.3B.tar.gz
spn.3B.tar.gz is incorrect in this repo, so get it from code.google
```
mkdir -p ~/isri-downloads
cd ~/isri-downloads
curl -L https://sourceforge.net/projects/isri-ocr-evaluation-tools-alt/files/bus.3B.tar.gz > bus.3B.tar.gz
curl -L https://sourceforge.net/projects/isri-ocr-evaluation-tools-alt/files/doe3.3B.tar.gz > doe3.3B.tar.gz
curl -L https://sourceforge.net/projects/isri-ocr-evaluation-tools-alt/files/mag.3B.tar.gz > mag.3B.tar.gz
curl -L https://sourceforge.net/projects/isri-ocr-evaluation-tools-alt/files/news.3B.tar.gz > news.3B.tar.gz
curl -L https://storage.googleapis.com/google-code-archive-downloads/v2/code.google.com/isri-ocr-evaluation-tools/spn.3B.tar.gz > spn.3B.tar.gz
```
### Step 2: extract the files.
It doesn't really matter where
in your filesystem you put them, but they must go under a common
root so you have directories doe3.3B, bus.3B, mag.3B and news.3B in, for example,
~/ISRI-OCRtk.
```
mkdir -p ~/ISRI-OCRtk
cd ~/ISRI-OCRtk
tar xzvf ~/isri-downloads/bus.3B.tar.gz
tar xzvf ~/isri-downloads/doe3.3B.tar.gz
tar xzvf ~/isri-downloads/mag.3B.tar.gz
tar xzvf ~/isri-downloads/news.3B.tar.gz
tar xzvf ~/isri-downloads/spn.3B.tar.gz
mkdir -p stopwords
cd stopwords
wget -O spa.stopwords.txt https://raw.githubusercontent.com/stopwords-iso/stopwords-es/master/stopwords-es.txt
```
Edit ~/ISRI-OCRtk/stopwords/spa.stopwords.txt
wordacc uses a space delimited stopwords file, not line delimited.
s/\n/ /g
Edit ~/ISRI-OCRtk/spn.3B/pages
Delete the line containing the following imagename as it [crashes tesseract](https://github.com/tesseract-ocr/tesseract/issues/1647#issuecomment-395954717).
7733_005.3B 3
### Step 3: Download the modified ISRI toolkit, make and install the tools :
These will be installed in /usr/local/bin.
```
git clone https://github.com/Shreeshrii/ocr-evaluation-tools.git
cd ~/ocr-evaluation-tools
sudo make install
```
### Step 4: cd back to your main tesseract-ocr dir and Build tesseract.
### Step 5: run unlvtests/runalltests.sh with the root ISRI data dir, testname, tessdata-dir:
```
unlvtests/runalltests.sh ~/ISRI-OCRtk 4_fast_eng ../tessdata_fast
```
and go to the gym, have lunch etc. It takes a while to run.
### Step 6: There should be a RELEASE.summary file
*unlvtests/reports/4-beta_fast.summary* that contains the final summarized accuracy
report and comparison with the 1995 results.
### Step 7: run the test for Spanish.
```
unlvtests/runalltests_spa.sh ~/ISRI-OCRtk 4_fast_spa ../tessdata_fast
```
#### Notes from Nick White regarding wordacc
If you just want to remove all lines which have 100% recognition,
you can add a 'awk' command like this:
ocrevalutf8 wordacc ground.txt ocr.txt | awk '$3 != 100 {print $0}' > results.txt
or if you've already got a results file you want to change, you can do this:
awk '$3 != 100 {print $0}' results.txt > newresults.txt
If you only want the last sections where things are broken down by
word, you can add a sed command, like this:
ocrevalutf8 wordacc ground.txt ocr.txt | sed '/^ Count Missed %Right $/,$!d' | awk '$3 != 100 {print $0}' > results.txt

View File

@ -1,68 +0,0 @@
#!/bin/bash
# File: counttestset.sh
# Description: Script to count the errors on a single UNLV set.
# Author: Ray Smith
# Created: Wed Jun 13 11:58:01 PDT 2007
#
# (C) Copyright 2007, Google Inc.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
if [ $# -ne 2 ]
then
  echo "Usage:$0 pagesfile langcode"
  exit 1
fi
if [ ! -d src/api ]
then
  echo "Run $0 from the tesseract-ocr root directory!"
  exit 1
fi
pages=$1
langcode=$2
# The pages file lives inside the image directory; derive the set name
# from the directory's basename.
imdir=${pages%/pages}
setname=${imdir##*/}
resdir=unlvtests/results/$setname
mkdir -p unlvtests/reports
echo "Counting on set $setname in directory $imdir to $resdir"
# Collect the per-page result files in arrays so filenames survive intact
# (the old string-concatenation approach depended on unquoted expansion).
accfiles=()
wafiles=()
while read -r page dir
do
  # A pages file may list "page subdir" pairs or just bare page names.
  if [ "$dir" ]
  then
    srcdir="$imdir/$dir"
  else
    srcdir="$imdir"
  fi
  #echo "$srcdir/$page.tif"
  # Convert groundtruth and recognized text to UTF-8 to correctly treat accented letters.
  iconv -f ISO8859-1 -t UTF-8 "$srcdir/$page.txt" >"$srcdir/$page.text"
  iconv -f ISO8859-1 -t UTF-8 "$resdir/$page.unlv" >"$resdir/$page.text"
  # Count character errors.
  ocrevalutf8 accuracy "$srcdir/$page.text" "$resdir/$page.text" > "$resdir/$page.acc"
  accfiles+=("$resdir/$page.acc")
  # Count word errors.
  #langcode should be either eng or spa
  if [ "$langcode" = "eng" ]
  then
    ocrevalutf8 wordacc "$srcdir/$page.text" "$resdir/$page.text" > "$resdir/$page.wa"
  else
    # Spanish runs exclude stopwords from the word-accuracy count.
    cp ~/ISRI-OCRtk/stopwords/spa.stopwords.txt "$resdir/spa.stopwords"
    ocrevalutf8 wordacc -S"$resdir/spa.stopwords" "$srcdir/$page.text" "$resdir/$page.text" > "$resdir/$page.wa"
  fi
  wafiles+=("$resdir/$page.wa")
done <"$pages"
# Summarize character and word accuracy over the whole set.
accsum "${accfiles[@]}" >"unlvtests/results/$setname.characc"
wordaccsum "${wafiles[@]}" >"unlvtests/results/$setname.wordacc"

View File

@ -1,53 +0,0 @@
#!/bin/bash
# Renames the cryptically-named downloaded UNLV test sets (3, B, M, N, ...)
# to meaningful names (doe3.3B, bus.3B, mag.3B, news.3B, ...), builds each
# set's "pages" file, and gives the image/zone/truth files proper
# .tif/.uzn/.txt extensions.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
if [ $# -ne 1 ]
then
  echo "Usage:$0 scantype"
  echo "UNLV data comes in several scan types:"
  echo "3B=300 dpi binary"
  echo "3A=adaptive thresholded 300 dpi"
  echo "3G=300 dpi grey"
  echo "4B=400dpi binary"
  echo "2B=200dpi binary"
  echo "For now we only use 3B"
  exit 1
fi
ext=$1
#There are several test sets without meaningful names, so rename
#them with something a bit more meaningful.
#Each s is oldname/newname
for s in 3/doe3 B/bus M/mag N/news L/legal R/rep S/spn Z/zset
do
  old=${s%/*}
  #if this set was downloaded then process it.
  if [ -r "$old/PAGES" ]
  then
    new=${s#*/}.$ext
    mkdir -p "$new"
    echo "Set $old -> $new"
    #The pages file had - instead of _ so fix it and add the extension.
    for page in $(cat "$old/PAGES")
    do
      echo "${page%-*}_${page#*-}.$ext"
    done >"$new/pages"
    for f in $(cat "$new/pages")
    do
      #Put a tif extension on the tif files.
      cp "$old/${old}_B/$f" "$new/$f.tif"
      #Put a uzn extension on the zone files.
      cp "$old/${old}_B/${f}Z" "$new/$f.uzn"
      #Cat all the truth files together and put into a single txt file.
      cat "$old/${old}_GT/${f%.$ext}".Z* >"$new/$f.txt"
    done
  fi
done

View File

@ -1 +0,0 @@
1995 bus.3B 5959 98.14% 0.00% 1631 96.83% 0.00% 1293 95.73% 0.00%

View File

@ -1 +0,0 @@
1995 doe3.3B 36349 97.52% 0.00% 7826 96.34% 0.00% 7042 94.87% 0.00%

View File

@ -1 +0,0 @@
1995 mag.3B 15043 97.74% 0.00% 4566 96.01% 0.00% 3379 94.99% 0.00%

View File

@ -1 +0,0 @@
1995 news.3B 6432 98.69% 0.00% 1946 97.68% 0.00% 1502 96.94% 0.00%

View File

@ -1,9 +0,0 @@
1995 bus.3B 5959 98.14% 0.00% 1631 96.83% 0.00% 1293 95.73% 0.00%
1995 doe3.3B 36349 97.52% 0.00% 7826 96.34% 0.00% 7042 94.87% 0.00%
1995 mag.3B 15043 97.74% 0.00% 4566 96.01% 0.00% 3379 94.99% 0.00%
1995 news.3B 6432 98.69% 0.00% 1946 97.68% 0.00% 1502 96.94% 0.00%
2.03 bus.3B 6422 97.99% 7.77% 1750 96.60% 7.30% 1361 95.51 5.26%
2.03 doe3.3B 29520 97.98% -18.79% 7966 96.27% 1.79% 6764 95.07 -3.95%
2.03 mag.3B 14568 97.81% -3.16% 4288 96.25% -6.09% 3054 95.47 -9.62%
2.03 news.3B 7655 98.44% 19.01% 1730 97.94% -11.10% 1208 97.54 -19.57%
2.03 Total 58165 - -8.81% 15734 - -1.47% 12387 - -6.27%

View File

@ -1,9 +0,0 @@
1995 bus.3B 5959 98.14% 0.00% 1631 96.83% 0.00% 1293 95.73% 0.00%
1995 doe3.3B 36349 97.52% 0.00% 7826 96.34% 0.00% 7042 94.87% 0.00%
1995 mag.3B 15043 97.74% 0.00% 4566 96.01% 0.00% 3379 94.99% 0.00%
1995 news.3B 6432 98.69% 0.00% 1946 97.68% 0.00% 1502 96.94% 0.00%
2.04 bus.3B 6422 97.99% 7.77% 1750 96.60% 7.30% 1361 95.51 5.26%
2.04 doe3.3B 29514 97.98% -18.80% 7963 96.27% 1.75% 6762 95.07 -3.98%
2.04 mag.3B 14568 97.81% -3.16% 4289 96.25% -6.07% 3053 95.47 -9.65%
2.04 news.3B 7655 98.44% 19.01% 1730 97.94% -11.10% 1208 97.54 -19.57%
2.04 Total 58159 - -8.82% 15732 - -1.48% 12384 - -6.30%

View File

@ -1,2 +0,0 @@
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWordErrors Accuracy TimeTaken
4_best_int_spa spn.3B 2846 99.18% 937 98.39% 739 97.54 6478.02s

View File

@ -1,2 +0,0 @@
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWordErrors Accuracy TimeTaken
4_best_spa spn.3B 2823 99.19% 924 98.41% 729 97.57 7233.76s

View File

@ -1,9 +0,0 @@
1995 bus.3B 5959 98.14% 0.00% 1631 96.83% 0.00% 1293 95.73% 0.00%
1995 doe3.3B 36349 97.52% 0.00% 7826 96.34% 0.00% 7042 94.87% 0.00%
1995 mag.3B 15043 97.74% 0.00% 4566 96.01% 0.00% 3379 94.99% 0.00%
1995 news.3B 6432 98.69% 0.00% 1946 97.68% 0.00% 1502 96.94% 0.00%
4_fast_eng bus.3B 6124 98.11% 2.77% 1138 97.88% -30.23% 963 97.05 -25.52% 3935.26s
4_fast_eng doe3.3B 30029 97.96% -17.39% 13781 94.45% 76.09% 13178 92.38 87.13% 18847.36s
4_fast_eng mag.3B 10934 98.37% -27.32% 3343 97.15% -26.78% 2813 96.06 -16.75% 6867.14s
4_fast_eng news.3B 5734 98.84% -10.85% 1322 98.45% -32.07% 1040 97.94 -30.76% 5527.38s
4_fast_eng Total 52821 - -17.19% 19584 - 22.64% 17994 - 36.15%

View File

@ -1,2 +0,0 @@
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWordErrors Accuracy TimeTaken
4_fast_spa spn.3B 2841 99.18% 879 98.49% 742 97.53 3838.82s

View File

@ -1,135 +0,0 @@
#!/bin/bash
# File: runalltests.sh
# Description: Script to run a set of UNLV test sets for English.
# Author: Ray Smith
# Created: Thu Jun 14 08:21:01 PDT 2007
#
# (C) Copyright 2007, Google Inc.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Usage: runalltests.sh unlv-data-dir version-id tessdata-dir
# Runs tesseract over the English UNLV test sets, scores the output via
# counttestset.sh, and writes per-set and grand-total summary reports that
# compare the new error counts against the archived 1995 baseline numbers.
if [ $# -ne 3 ]
then
  echo "Usage:$0 unlv-data-dir version-id tessdata-dir"
  exit 1
fi
if [ ! -d src/api ]
then
  echo "Run $0 from the tesseract-ocr root directory!"
  exit 1
fi
if [ ! -r src/api/tesseract ] && [ ! -r tesseract.exe ]
then
  echo "Please build tesseract before running $0"
  exit 1
fi
tessdata=$3
# deltapc new old: prints the percent change from old to new.
# The values are passed with awk -v instead of being spliced into the awk
# program text, so a non-numeric argument cannot corrupt the awk syntax.
deltapc() {
  awk -v nv="$1" -v ov="$2" 'BEGIN {
    printf("%.2f", 100.0 * (nv - ov) / ov);
  }'
}
# timesum file: totals the per-page CPU times (2nd column) of the given file.
timesum() {
  awk ' BEGIN {
    total = 0.0;
  }
  {
    total += $2;
  }
  END {
    printf("%.2f\n", total);
  }' "$1"
}
imdir="$1"
vid="$2"
# Directory holding this script; runtestset.sh and counttestset.sh sit beside it.
bindir=${0%/*}
if [ "$bindir" = "$0" ]
then
  bindir="./"
fi
rdir=unlvtests/reports
testsets="bus.3B doe3.3B mag.3B news.3B"
#testsets="bus.3B"
totalerrs=0
totalwerrs=0
totalnswerrs=0
totalolderrs=0
totaloldwerrs=0
totaloldnswerrs=0
for set in $testsets
do
  if [ -r "$imdir/$set/pages" ]
  then
    # Run tesseract on all the pages.
    "$bindir"/runtestset.sh "$imdir/$set/pages" "$tessdata" "eng"
    # Count the errors on all the pages.
    "$bindir"/counttestset.sh "$imdir/$set/pages" "eng"
    # Get the old character word and nonstop word errors.
    olderrs=$(cut -f3 "unlvtests/reports/1995.$set.sum")
    oldwerrs=$(cut -f6 "unlvtests/reports/1995.$set.sum")
    oldnswerrs=$(cut -f9 "unlvtests/reports/1995.$set.sum")
    # Get the new character word and nonstop word errors and accuracy.
    # The .characc/.wordacc files are fixed-column ISRI tool reports, hence
    # the cut -cN-M column slicing.
    cherrs=$(head -4 "unlvtests/results/$set.characc" |tail -1 |cut -c1-9 |
      tr -d '[:blank:]')
    chacc=$(head -5 "unlvtests/results/$set.characc" |tail -1 |cut -c1-9 |
      tr -d '[:blank:]')
    wderrs=$(head -4 "unlvtests/results/$set.wordacc" |tail -1 |cut -c1-9 |
      tr -d '[:blank:]')
    wdacc=$(head -5 "unlvtests/results/$set.wordacc" |tail -1 |cut -c1-9 |
      tr -d '[:blank:]')
    nswderrs=$(grep Total "unlvtests/results/$set.wordacc" |head -2 |tail -1 |
      cut -c10-17 |tr -d '[:blank:]')
    nswdacc=$(grep Total "unlvtests/results/$set.wordacc" |head -2 |tail -1 |
      cut -c19-26 |tr -d '[:blank:]')
    # Compute the percent change.
    chdelta=$(deltapc "$cherrs" "$olderrs")
    wdelta=$(deltapc "$wderrs" "$oldwerrs")
    nswdelta=$(deltapc "$nswderrs" "$oldnswerrs")
    sumfile=$rdir/$vid.$set.sum
    if [ -r "unlvtests/results/$set.times" ]
    then
      total_time=$(timesum "unlvtests/results/$set.times")
      if [ -r "unlvtests/results/prev/$set.times" ]
      then
        # Per-page time difference against the previous run, slowest first.
        paste "unlvtests/results/prev/$set.times" "unlvtests/results/$set.times" |
          awk '{ printf("%s %.2f\n", $1, $4-$2); }' |sort -k2n >"unlvtests/results/$set.timedelta"
      fi
    else
      total_time='0.0'
    fi
    echo "$vid $set $cherrs $chacc $chdelta% $wderrs $wdacc\
 $wdelta% $nswderrs $nswdacc $nswdelta% ${total_time}s" >"$sumfile"
    # Sum totals over all the testsets; $(( )) replaces the deprecated 'let'.
    totalerrs=$((totalerrs + cherrs))
    totalwerrs=$((totalwerrs + wderrs))
    totalnswerrs=$((totalnswerrs + nswderrs))
    totalolderrs=$((totalolderrs + olderrs))
    totaloldwerrs=$((totaloldwerrs + oldwerrs))
    totaloldnswerrs=$((totaloldnswerrs + oldnswerrs))
  fi
done
# Compute grand total percent change.
chdelta=$(deltapc "$totalerrs" "$totalolderrs")
wdelta=$(deltapc "$totalwerrs" "$totaloldwerrs")
nswdelta=$(deltapc "$totalnswerrs" "$totaloldnswerrs")
tfile=$rdir/$vid.total.sum
echo "$vid Total $totalerrs - $chdelta% $totalwerrs\
 - $wdelta% $totalnswerrs - $nswdelta%" >"$tfile"
# The 1995.* glob is intentionally unquoted so it expands to the baselines.
cat $rdir/1995.*.sum "$rdir/$vid".*.sum >"$rdir/$vid".summary
mv "$rdir/$vid".*.sum unlvtests/results/
cat "$rdir/$vid".summary

View File

@ -1,109 +0,0 @@
#!/bin/bash
##############################################################################
# File: runalltests_spa.sh
# Description: Script to run a set of UNLV test sets for Spanish.
# based on runalltests.sh by Ray Smith
# Author: Shree Devi Kumar
# Created: June 09, 2018
#
# (C) Copyright 2007, Google Inc.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
##############################################################################
# Usage: runalltests_spa.sh unlv-data-dir version-id tessdata-dir
# Runs tesseract over the Spanish UNLV test set (spn.3B), scores the output
# via counttestset.sh, and writes a summary report. Unlike runalltests.sh
# there is no 1995 baseline to diff against, so no percent-change columns.
if [ $# -ne 3 ]
then
  echo "Usage:$0 unlv-data-dir version-id tessdata-dir"
  exit 1
fi
if [ ! -d src/api ]
then
  echo "Run $0 from the tesseract-ocr root directory!"
  exit 1
fi
if [ ! -r src/api/tesseract ] && [ ! -r tesseract.exe ]
then
  echo "Please build tesseract before running $0"
  exit 1
fi
tessdata=$3
# NOTE: the original assigned lang=$4, but the $# -ne 3 guard above means a
# 4th argument can never be supplied, and the language is hard-coded to
# "spa" below — the dead assignment has been removed.
# timesum file: totals the per-page CPU times (2nd column) of the given file.
timesum() {
  awk ' BEGIN {
    total = 0.0;
  }
  {
    total += $2;
  }
  END {
    printf("%.2f\n", total);
  }' "$1"
}
imdir="$1"
vid="$2"
# Directory holding this script; runtestset.sh and counttestset.sh sit beside it.
bindir=${0%/*}
if [ "$bindir" = "$0" ]
then
  bindir="./"
fi
rdir=unlvtests/reports
testsets="spn.3B"
totalerrs=0
totalwerrs=0
totalnswerrs=0
for set in $testsets
do
  if [ -r "$imdir/$set/pages" ]
  then
    # Run tesseract on all the pages.
    "$bindir"/runtestset.sh "$imdir/$set/pages" "$tessdata" "spa"
    # Count the errors on all the pages.
    "$bindir"/counttestset.sh "$imdir/$set/pages" "spa"
    # Get the new character word and nonstop word errors and accuracy.
    # The .characc/.wordacc files are fixed-column ISRI tool reports, hence
    # the cut -cN-M column slicing.
    cherrs=$(head -4 "unlvtests/results/$set.characc" |tail -1 |cut -c1-9 |
      tr -d '[:blank:]')
    chacc=$(head -5 "unlvtests/results/$set.characc" |tail -1 |cut -c1-9 |
      tr -d '[:blank:]')
    wderrs=$(head -4 "unlvtests/results/$set.wordacc" |tail -1 |cut -c1-9 |
      tr -d '[:blank:]')
    wdacc=$(head -5 "unlvtests/results/$set.wordacc" |tail -1 |cut -c1-9 |
      tr -d '[:blank:]')
    nswderrs=$(grep Total "unlvtests/results/$set.wordacc" |head -2 |tail -1 |
      cut -c10-17 |tr -d '[:blank:]')
    nswdacc=$(grep Total "unlvtests/results/$set.wordacc" |head -2 |tail -1 |
      cut -c19-26 |tr -d '[:blank:]')
    sumfile=$rdir/$vid.$set.sum
    if [ -r "unlvtests/results/$set.times" ]
    then
      total_time=$(timesum "unlvtests/results/$set.times")
      if [ -r "unlvtests/results/prev/$set.times" ]
      then
        # Per-page time difference against the previous run, slowest first.
        paste "unlvtests/results/prev/$set.times" "unlvtests/results/$set.times" |
          awk '{ printf("%s %.2f\n", $1, $4-$2); }' |sort -k2n >"unlvtests/results/$set.timedelta"
      fi
    else
      total_time='0.0'
    fi
    echo "RELEASE TestSet CharErrors Accuracy WordErrors Accuracy\
 NonStopWordErrors Accuracy TimeTaken">"$sumfile"
    echo "$vid $set $cherrs $chacc $wderrs $wdacc\
 $nswderrs $nswdacc ${total_time}s" >>"$sumfile"
  fi
done
cat "$rdir/$vid".*.sum >"$rdir/$vid".summary
mv "$rdir/$vid".*.sum unlvtests/results/
cat "$rdir/$vid".summary

View File

@ -1,80 +0,0 @@
#!/bin/bash
# File: runtestset.sh
# Description: Script to run tesseract on a single UNLV set.
# Author: Ray Smith
# Created: Wed Jun 13 10:13:01 PDT 2007
#
# (C) Copyright 2007, Google Inc.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Usage: runtestset.sh pagesfile tessdata-dir lang [-zoning]
# Runs tesseract on every page listed in pagesfile, writing recognition
# output and per-page CPU times under unlvtests/results/.
if [ $# -ne 3 ] && [ $# -ne 4 ]
then
  echo "Usage:$0 pagesfile tessdata-dir lang [-zoning]"
  exit 1
fi
if [ ! -d src/api ]
then
  echo "Run $0 from the tesseract-ocr root directory!"
  exit 1
fi
# Build the tesseract command as an array so the GNU time wrapper and its
# arguments survive expansion as separate words (no string re-splitting).
if [ ! -r src/api/tesseract ]
then
  if [ ! -r tesseract.exe ]
  then
    echo "Please build tesseract before running $0"
    exit 1
  else
    tess=(./tesseract.exe)
  fi
else
  # GNU time: %U = user CPU seconds, written to times.txt per page.
  tess=(time -f %U -o times.txt src/api/tesseract)
  #tess=(time -f %U -o times.txt tesseract)
fi
tessdata=$2
lang=$3
pages=$1
imdir=${pages%/pages}   # strip trailing /pages -> image directory
setname=${imdir##*/}    # last path component is the test-set name
if [ $# -eq 4 ] && [ "$4" = "-zoning" ]
then
  config=unlv.auto
  resdir=unlvtests/results/zoning.$setname
else
  config=unlv
  resdir=unlvtests/results/$setname
fi
echo -e "Testing on set $setname in directory $imdir to $resdir\n"
mkdir -p "$resdir"
rm -f "unlvtests/results/$setname.times"
while read -r page dir
do
  # A pages file may be a list of files with subdirs or maybe just
  # a plain list of files so accommodate both.
  if [ "$dir" ]
  then
    srcdir="$imdir/$dir"
  else
    srcdir="$imdir"
  fi
  # echo "$srcdir/$page.tif"
  "${tess[@]}" "$srcdir/$page.tif" "$resdir/$page" --tessdata-dir "$tessdata" --oem 1 -l "$lang" --psm 6 "$config" 2>&1 |grep -v "OCR Engine" |grep -v "Page 1"
  if [ -r times.txt ]
  then
    read -r t <times.txt
    echo "$page $t" >>"unlvtests/results/$setname.times"
    # \033M (reverse line feed) keeps the progress line updating in place.
    echo -e "\033M$page $t"
    # GNU time records this message when the child is interrupted (Ctrl-C);
    # stop cleanly instead of ploughing through the remaining pages.
    if [ "$t" = "Command terminated by signal 2" ]
    then
      exit 0
    fi
  fi
done <"$pages"