move langtests and unlvtests from tesseract-ocr repository to test repository

This commit is contained in:
zdenop 2018-11-08 22:31:32 +01:00
parent cbef2ebe12
commit cdfb768010
43 changed files with 1 additions and 1034 deletions

View File

@ -24,7 +24,7 @@ SUBDIRS += src/ccmain src/api . tessdata doc unittest
EXTRA_DIST = README.md LICENSE
EXTRA_DIST += aclocal.m4 config configure.ac autogen.sh
EXTRA_DIST += tesseract.pc.in $(TRAINING_SUBDIR) java doc langtests unlvtests
EXTRA_DIST += tesseract.pc.in $(TRAINING_SUBDIR) java doc
EXTRA_DIST += CMakeLists.txt tesseract.pc.cmake cmake VERSION src/vs2010 cppan.yml
DIST_SUBDIRS = $(SUBDIRS) $(TRAINING_SUBDIR)

View File

@ -463,7 +463,6 @@ fi
# Output files
AC_CONFIG_FILES([Makefile tesseract.pc])
AC_CONFIG_FILES([langtests/Makefile])
AC_CONFIG_FILES([src/api/Makefile])
AC_CONFIG_FILES([src/api/tess_version.h])
AC_CONFIG_FILES([src/arch/Makefile])
@ -481,7 +480,6 @@ AC_CONFIG_FILES([src/wordrec/Makefile])
AC_CONFIG_FILES([tessdata/Makefile])
AC_CONFIG_FILES([tessdata/configs/Makefile])
AC_CONFIG_FILES([tessdata/tessconfigs/Makefile])
AC_CONFIG_FILES([unlvtests/Makefile])
AC_CONFIG_FILES([unittest/Makefile])
AC_CONFIG_FILES([java/Makefile])
AC_CONFIG_FILES([java/com/Makefile])

View File

@ -1,2 +0,0 @@
#
results/*

View File

@ -1,8 +0,0 @@
EXTRA_DIST = README.md
EXTRA_DIST += frk_setup.sh
EXTRA_DIST += frk_test.sh
EXTRA_DIST += counttestset.sh
EXTRA_DIST += runlangtests.sh
EXTRA_DIST += runtestset.sh
EXTRA_DIST += reports/*

View File

@ -1,54 +0,0 @@
# Language tests.
The scripts in this directory make it possible to test Accuracy of Tesseract for different languages.
## Setup
### Step 1: If not already installed, download the modified ISRI toolkit,
make and install the tools in /usr/local/bin.
```
git clone https://github.com/Shreeshrii/ocr-evaluation-tools.git
cd ~/ocr-evaluation-tools
sudo make install
```
### Step 2: If not already built, build Tesseract.
Use binaries from the tesseract/src/api and tesseract/src/training directory.
### Step 3
Download images and corresponding ground truth text for the language to be tested.
Each testset should have only one kind of images (eg. tif, png, jpg etc).
The ground truth text files should have the same base filename with txt extension.
As needed, modify the filenames and create the `pages` file for each testset.
Instructions for testing Fraktur and Sanskrit languages are given below as an example.
## Testing for Fraktur - frk and script/Fraktur
### Download the images and groundtruth, modify to required format.
```
bash -x frk_setup.sh
```
### Run tests for Fraktur - frk and script/Fraktur
```
bash -x frk_test.sh
```
## Testing for Sanskrit - san and script/Devanagari
### Download the images and groundtruth, modify to required format.
```
bash -x deva_setup.sh
```
### Run tests
```
bash -x deva_test.sh
```
### Notes from Nick White regarding wordacc
If you just want to remove all lines which have 100% recognition,
you can add a 'awk' command like this:
ocrevalutf8 wordacc ground.txt ocr.txt | awk '$3 != 100 {print $0}' > results.txt
or if you've already got a results file you want to change, you can do this:
awk '$3 != 100 {print $0}' results.txt > newresults.txt
If you only want the last sections where things are broken down by
word, you can add a sed command, like this:
ocrevalutf8 wordacc ground.txt ocr.txt | sed '/^ Count Missed %Right $/,$!d' | awk '$3 != 100 {print $0}' > results.txt

View File

@ -1,52 +0,0 @@
#!/bin/bash
# File: counttestset.sh
# Description: Script to count the errors on a single UNLV set.
# Author: Ray Smith
# Created: Wed Jun 13 11:58:01 PDT 2007
#
# (C) Copyright 2007, Google Inc.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
if [ $# -ne 2 ]
then
  echo "Usage:$0 pagesfile langcode"
  exit 1
fi
pages=$1
langcode=$2
# The pages file lives inside the image directory; derive the set name
# from the directory's basename.
imdir=${pages%/pages}
setname=${imdir##*/}
resdir=langtests/results/$setname
mkdir -p langtests/reports
echo "Counting on set $setname in directory $imdir to $resdir"
# Collect the per-page result files in arrays so filenames survive intact
# (the old string-concatenation approach depended on unquoted expansion).
accfiles=()
wafiles=()
while read -r page dir
do
  # A pages file may list "page subdir" pairs or just bare page names.
  if [ "$dir" ]
  then
    srcdir="$imdir/$dir"
  else
    srcdir="$imdir"
  fi
  echo "$srcdir/$page"
  # Count character errors.
  ocrevalutf8 accuracy "$srcdir/$page.txt" "$resdir/$page.txt" > "$resdir/$page.acc"
  accfiles+=("$resdir/$page.acc")
  # Count word errors, ignoring the language's stopwords.
  ocrevalutf8 wordacc -S"$resdir/$langcode.stopwords" "$srcdir/$page.txt" "$resdir/$page.txt" > "$resdir/$page.wa"
  wafiles+=("$resdir/$page.wa")
done <"$pages"
# Summarize character and word accuracy over the whole set.
accsum "${accfiles[@]}" >"langtests/results/$setname.characc"
wordaccsum "${wafiles[@]}" >"langtests/results/$setname.wordacc"

View File

@ -1,18 +0,0 @@
#!/bin/bash
#
# Prepare the Sanskrit (san) test sets for the language accuracy tests:
# copy images and ground truth from ~/lang-deva-downloads into
# ~/lang-files/san-*, build each set's "pages" file, and install the
# Devanagari stopwords list into ~/lang-stopwords.
mkdir -p ~/lang-files
rm -rf ~/lang-files/san-*
for testset in vedic fontsamples oldstyle shreelipi alphabetsamples
do
  cd ~/lang-files || exit 1
  mkdir -p "./san-$testset"
  cp ~/lang-deva-downloads/imagessan/"$testset"/*.* "./san-$testset/"
  cd "./san-$testset/" || exit 1
  # Ground truth arrives as *-gt.txt; the test scripts expect plain .txt.
  rename s/-gt.txt/.txt/ *.txt
  ls -1 *.png >pages
  # Strip the literal ".png" extension: escape the dot and anchor at the
  # end of the line (the old 's/.png//g' matched any char followed by
  # "png" anywhere in the name).
  sed -i -e 's/\.png$//' pages
done
mkdir -p ~/lang-stopwords
cd ~/lang-stopwords || exit 1
cp ~/lang-deva-downloads/imagessan/stopwords.txt ./san.stopwords.txt

View File

@ -1,18 +0,0 @@
#!/bin/bash
# run langtests/runlangtests.sh with the root data dir, testname, tessdata-dir, language code and image extension
# Runs the Sanskrit accuracy tests against the fast, best-integer and best
# float models for both the Devanagari script model and the san language
# model, plus a locally fine-tuned model (tesstutorial-deva).
cd ~/tesseract
# Script-level (Devanagari) models: fast, best quantized to int, best float.
langtests/runlangtests.sh ~/lang-files 4_fast_Devanagari ../tessdata_fast/script Devanagari png
langtests/runlangtests.sh ~/lang-files 4_best_int_Devanagari ../tessdata/script Devanagari png
langtests/runlangtests.sh ~/lang-files 4_best_Devanagari ../tessdata_best/script Devanagari png
# Language (san) models: fast, best quantized to int, best float.
langtests/runlangtests.sh ~/lang-files 4_fast_san ../tessdata_fast san png
langtests/runlangtests.sh ~/lang-files 4_best_int_san ../tessdata san png
langtests/runlangtests.sh ~/lang-files 4_best_san ../tessdata_best san png
# Fine-tuned model after 40k further training iterations.
langtests/runlangtests.sh ~/lang-files 4_plus40k_san ../tesstutorial-deva san png
#/home/ubuntu/tesstutorial-deva/san.traineddata at n iterations
### It takes a while to run.

View File

@ -1,24 +0,0 @@
#!/bin/bash
#
# Download the Fraktur (frk) test images and ground truth, arrange them in
# ~/lang-files/frk-ligatures with a "pages" file, and install a German
# stopwords list into ~/lang-stopwords.
mkdir -p ~/lang-downloads
cd ~/lang-downloads || exit 1
wget -O frk-jbarth-ubhd.zip http://digi.ub.uni-heidelberg.de/diglitData/v/abbyy11r8-vs-tesseract4.zip
wget -O frk-stweil-gt.zip https://digi.bib.uni-mannheim.de/~stweil/fraktur-gt.zip
mkdir -p ~/lang-files
cd ~/lang-files || exit 1
unzip ~/lang-downloads/frk-jbarth-ubhd.zip -d frk
unzip ~/lang-downloads/frk-stweil-gt.zip -d frk
mkdir -p ./frk-ligatures
cp ./frk/abbyy-vs-tesseract/*.tif ./frk-ligatures/
cp ./frk/gt/*.txt ./frk-ligatures/
cd ./frk-ligatures/ || exit 1
ls -1 *.tif >pages
# Strip the literal ".tif" extension: escape the dot and anchor at the end
# of the line (the old 's/.tif//g' matched any char followed by "tif").
sed -i -e 's/\.tif$//' pages
mkdir -p ~/lang-stopwords
cd ~/lang-stopwords || exit 1
wget -O frk.stopwords.txt https://raw.githubusercontent.com/stopwords-iso/stopwords-de/master/stopwords-de.txt
# Fixed path in the message: the file is saved in ~/lang-stopwords/, not
# ~/lang-files/stopwords/.
echo "Edit ~/lang-stopwords/frk.stopwords.txt as wordacc uses a space delimited stopwords file, not line delimited."

View File

@ -1,13 +0,0 @@
#!/bin/bash
#
# run langtests/runlangtests.sh with the root ISRI data dir, testname, tessdata-dir, language code:
# Runs the Fraktur accuracy tests: the fast script (Fraktur) model, then
# the fast, best-integer and best float frk language models.
cd ~/tesseract
langtests/runlangtests.sh ~/lang-files 4_fast_Fraktur ../tessdata_fast/script Fraktur tif
langtests/runlangtests.sh ~/lang-files 4_fast_frk ../tessdata_fast frk tif
langtests/runlangtests.sh ~/lang-files 4_best_int_frk ../tessdata frk tif
langtests/runlangtests.sh ~/lang-files 4_best_frk ../tessdata_best frk tif
### It takes a while to run.

View File

@ -1,8 +0,0 @@
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken
4_best_Devanagari san-alphabetsamples 2013 56.17% 1323 12.27% 1323 12.27 606.28s
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken
4_best_Devanagari san-fontsamples 388 94.82% 87 86.38% 87 86.38 570.17s
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken
4_best_Devanagari san-oldstyle 2796 59.93% 523 39.61% 523 39.61 447.73s
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken
4_best_Devanagari san-shreelipi 830 94.01% 311 81.40% 311 81.40 1137.51s

View File

@ -1,2 +0,0 @@
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken
4_best_frk frk-ligatures 178 94.73% 100 81.31% 74 75.17 94.29s

View File

@ -1,8 +0,0 @@
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken
4_best_int_Devanagari san-alphabetsamples 2010 56.24% 1321 12.40% 1321 12.40 556.26s
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken
4_best_int_Devanagari san-fontsamples 396 94.72% 89 86.07% 89 86.07 524.07s
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken
4_best_int_Devanagari san-oldstyle 2812 59.70% 523 39.61% 523 39.61 416.57s
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken
4_best_int_Devanagari san-shreelipi 829 94.01% 314 81.22% 314 81.22 1087.02s

View File

@ -1,2 +0,0 @@
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken
4_best_int_frk frk-ligatures 244 92.78% 109 79.63% 80 73.15 367.73s

View File

@ -1,8 +0,0 @@
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken
4_best_int_san san-alphabetsamples 2342 49.01% 1353 10.28% 1353 10.28 281.60s
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken
4_best_int_san san-fontsamples 474 93.68% 126 80.28% 126 80.28 281.05s
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken
4_best_int_san san-oldstyle 3121 55.27% 602 30.48% 602 30.48 206.20s
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken
4_best_int_san san-shreelipi 1163 91.60% 417 75.06% 417 75.06 606.80s

View File

@ -1,8 +0,0 @@
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken
4_best_san san-alphabetsamples 2335 49.16% 1348 10.61% 1348 10.61 300.24s
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken
4_best_san san-fontsamples 473 93.69% 126 80.28% 126 80.28 267.05s
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken
4_best_san san-oldstyle 3121 55.27% 598 30.95% 598 30.95 205.28s
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken
4_best_san san-shreelipi 1168 91.56% 414 75.24% 414 75.24 610.52s

View File

@ -1,8 +0,0 @@
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken
4_fast_Devanagari san-alphabetsamples 2017 56.09% 1317 12.67% 1317 12.67 400.38s
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken
4_fast_Devanagari san-fontsamples 433 94.22% 108 83.10% 108 83.10 287.48s
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken
4_fast_Devanagari san-oldstyle 2883 58.68% 543 37.30% 543 37.30 289.85s
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken
4_fast_Devanagari san-shreelipi 750 94.58% 279 83.31% 279 83.31 813.19s

View File

@ -1,2 +0,0 @@
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken
4_fast_Fraktur frk-ligatures 265 92.16% 116 78.32% 82 72.48 91.29s

View File

@ -1,2 +0,0 @@
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken
4_fast_frk frk-ligatures 244 92.78% 109 79.63% 80 73.15 89.98s

View File

@ -1,8 +0,0 @@
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken
4_fast_san san-alphabetsamples 2342 49.01% 1353 10.28% 1353 10.28 276.73s
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken
4_fast_san san-fontsamples 474 93.68% 126 80.28% 126 80.28 278.34s
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken
4_fast_san san-oldstyle 3121 55.27% 602 30.48% 602 30.48 222.35s
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken
4_fast_san san-shreelipi 1163 91.60% 417 75.06% 417 75.06 626.40s

View File

@ -1,8 +0,0 @@
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken
4_plus10k_san san-alphabetsamples 1725 62.44% 1112 26.26% 1112 26.26 160.48s
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken
4_plus10k_san san-fontsamples 349 95.34% 73 88.58% 73 88.58 138.09s
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken
4_plus10k_san san-oldstyle 2818 59.62% 548 36.72% 548 36.72 120.83s
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken
4_plus10k_san san-shreelipi 746 94.61% 279 83.31% 279 83.31 292.70s

View File

@ -1,8 +0,0 @@
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken
4_plus20k_san san-alphabetsamples 1441 68.63% 841 44.23% 841 44.23 156.57s
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken
4_plus20k_san san-fontsamples 356 95.25% 75 88.26% 75 88.26 135.13s
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken
4_plus20k_san san-oldstyle 2862 58.99% 555 35.91% 555 35.91 118.21s
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken
4_plus20k_san san-shreelipi 726 94.76% 267 84.03% 267 84.03 295.68s

View File

@ -1,8 +0,0 @@
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken
4_plus30k_san san-alphabetsamples 1656 63.95% 937 37.86% 937 37.86 615.62s
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken
4_plus30k_san san-fontsamples 429 94.28% 89 86.07% 89 86.07 617.42s
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken
4_plus30k_san san-oldstyle 2885 58.66% 561 35.22% 561 35.22 432.58s
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken
4_plus30k_san san-shreelipi 447 96.77% 123 92.64% 123 92.64 1081.29s

View File

@ -1,8 +0,0 @@
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken
4_plus40k_san san-alphabetsamples 1380 69.95% 775 48.61% 775 48.61 1198.16s
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken
4_plus40k_san san-fontsamples 401 94.65% 79 87.64% 79 87.64 1275.08s
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken
4_plus40k_san san-oldstyle 2860 59.01% 534 38.34% 534 38.34 977.65s
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken
4_plus40k_san san-shreelipi 441 96.81% 113 93.24% 113 93.24 2301.53s

View File

@ -1,105 +0,0 @@
#!/bin/bash
##############################################################################
# File: runlangtests.sh
# Description: Script to run a set of accuracy test sets for any language.
# based on runalltests.sh by Ray Smith
# Author: Shree Devi Kumar
# Created: June 09, 2018
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
##############################################################################
if [ $# -ne 5 ]
then
  echo "Usage:$0 unlv-data-dir version-id tessdata-dir langcode imgext"
  exit 1
fi
tessdata=$3
lang=$4
imgext=$5
# timesum: sum the per-page cpu times (second column of the .times file).
timesum() {
  awk ' BEGIN {
    total = 0.0;
  }
  {
    total += $2;
  }
  END {
    printf("%.2f\n", total);
  }' "$1"
}
imdir="$1"
vid="$2"
bindir=${0%/*}
if [ "$bindir" = "$0" ]
then
  bindir="./"
fi
rdir=langtests/reports
# Map the requested language/script to its test sets and to the language
# code used for the stopwords file produced by the *_setup.sh scripts.
testsets=""
if [ "$lang" = "frk" ] || [ "$lang" = "Fraktur" ]
then
  testsets="frk-ligatures"
  stoplang=frk
fi
if [ "$lang" = "san" ] || [ "$lang" = "Devanagari" ]
then
  testsets="san-fontsamples san-oldstyle san-shreelipi san-alphabetsamples"
  stoplang=san
fi
if [ -z "$testsets" ]
then
  echo "Unsupported language code: $lang"
  exit 1
fi
for set in $testsets
do
  resdir=langtests/results/$set
  mkdir -p "$resdir"
  # Bug fix: copy the stopwords for the language under test; the old code
  # always copied the Fraktur (frk) stopwords, even for Sanskrit runs.
  cp ~/lang-stopwords/"$stoplang".stopwords.txt "$resdir/$lang.stopwords"
  if [ -r "$imdir/$set/pages" ]
  then
    # Run tesseract on all the pages.
    "$bindir"/runtestset.sh "$imdir/$set/pages" "$tessdata" "$lang" "$imgext"
    # Count the errors on all the pages.
    "$bindir"/counttestset.sh "$imdir/$set/pages" "$lang"
    # Get the new character word and nonstop word errors and accuracy.
    cherrs=$(head -4 "langtests/results/$set.characc" |tail -1 |cut -c1-9 |
      tr -d '[:blank:]')
    chacc=$(head -5 "langtests/results/$set.characc" |tail -1 |cut -c1-9 |
      tr -d '[:blank:]')
    wderrs=$(head -4 "langtests/results/$set.wordacc" |tail -1 |cut -c1-9 |
      tr -d '[:blank:]')
    wdacc=$(head -5 "langtests/results/$set.wordacc" |tail -1 |cut -c1-9 |
      tr -d '[:blank:]')
    nswderrs=$(grep Total "langtests/results/$set.wordacc" |head -2 |tail -1 |
      cut -c10-17 |tr -d '[:blank:]')
    nswdacc=$(grep Total "langtests/results/$set.wordacc" |head -2 |tail -1 |
      cut -c19-26 |tr -d '[:blank:]')
    sumfile=$rdir/$vid.$set.sum
    if [ -r "langtests/results/$set.times" ]
    then
      total_time=$(timesum "langtests/results/$set.times")
    else
      total_time='0.0'
    fi
    # Write a one-set summary: header line, then the measured numbers.
    echo "RELEASE TestSet CharErrors Accuracy WordErrors Accuracy\
 NonStopWErrors Accuracy TimeTaken">"$sumfile"
    echo "$vid $set $cherrs $chacc $wderrs $wdacc\
 $nswderrs $nswdacc ${total_time}s" >>"$sumfile"
  fi
done
# Concatenate the per-set summaries into one report and display it.
cat "$rdir/$vid".*.sum >"$rdir/$vid".summary
mv "$rdir/$vid".*.sum langtests/results/
cat "$rdir/$vid".summary

View File

@ -1,61 +0,0 @@
#!/bin/bash
# File: runtestset.sh
# Description: Script to run tesseract on a single UNLV set.
# Author: Ray Smith
# Created: Wed Jun 13 10:13:01 PDT 2007
#
# (C) Copyright 2007, Google Inc.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
if [ $# -ne 4 ]
then
  echo "Usage:$0 pagesfile tessdata-dir langcode imgext"
  exit 1
fi
# Wrap tesseract in GNU time: %U = user cpu seconds, written to times.txt.
# $tess is deliberately expanded unquoted below so it splits into words.
tess="time -f %U -o times.txt ./src/api/tesseract"
tessdata=$2
langcode=$3
imgext=$4
pages=$1
imdir=${pages%/pages}
setname=${imdir##*/}
config=""
resdir=langtests/results/$setname
echo -e "Testing on set $setname in directory $imdir to $resdir\n"
mkdir -p "$resdir"
rm -f "langtests/results/$setname.times"
while read -r page dir
do
  # A pages file may be a list of files with subdirs or maybe just
  # a plain list of files so accommodate both.
  if [ "$dir" ]
  then
    srcdir="$imdir/$dir"
  else
    srcdir="$imdir"
  fi
  echo "$srcdir/$page"
  # Filter tesseract's routine banner lines out of the output.
  $tess "$srcdir/$page.$imgext" "$resdir/$page" --tessdata-dir "$tessdata" --oem 1 -l "$langcode" --psm 6 $config 2>&1 |grep -v "OCR Engine" |grep -v "Page 1"
  if [ -r times.txt ]
  then
    read -r t <times.txt
    echo "$page $t" >>"langtests/results/$setname.times"
    # \033M reverse-scrolls one line so progress overwrites in place.
    echo -e "\033M$page $t"
    # GNU time writes this message when tesseract was interrupted (Ctrl-C);
    # stop the whole run in that case.
    if [ "$t" = "Command terminated by signal 2" ]
    then
      exit 0
    fi
  fi
done <"$pages"

View File

@ -1,12 +0,0 @@
EXTRA_DIST = README.md
EXTRA_DIST += counttestset.sh
EXTRA_DIST += runalltests.sh
EXTRA_DIST += runalltests_spa.sh
EXTRA_DIST += runtestset.sh
EXTRA_DIST += reports/1995.bus.3B.sum
EXTRA_DIST += reports/1995.doe3.3B.sum
EXTRA_DIST += reports/1995.mag.3B.sum
EXTRA_DIST += reports/1995.news.3B.sum
EXTRA_DIST += reports/2.03.summary
EXTRA_DIST += reports/2.04.summary

View File

@ -1,94 +0,0 @@
## How to run UNLV tests.
The scripts in this directory make it possible to duplicate the tests
published in the Fourth Annual Test of OCR Accuracy.
See http://www.expervision.com/wp-content/uploads/2012/12/1995.The_Fourth_Annual_Test_of_OCR_Accuracy.pdf
but first you have to get the tools and data used by UNLV:
### Step 1: to download the images go to
https://sourceforge.net/projects/isri-ocr-evaluation-tools-alt/files/
and get doe3.3B.tar.gz, bus.3B.tar.gz, mag.3B.tar.gz and news.3B.tar.gz
spn.3B.tar.gz is incorrect in this repo, so get it from code.google
```
mkdir -p ~/isri-downloads
cd ~/isri-downloads
curl -L https://sourceforge.net/projects/isri-ocr-evaluation-tools-alt/files/bus.3B.tar.gz > bus.3B.tar.gz
curl -L https://sourceforge.net/projects/isri-ocr-evaluation-tools-alt/files/doe3.3B.tar.gz > doe3.3B.tar.gz
curl -L https://sourceforge.net/projects/isri-ocr-evaluation-tools-alt/files/mag.3B.tar.gz > mag.3B.tar.gz
curl -L https://sourceforge.net/projects/isri-ocr-evaluation-tools-alt/files/news.3B.tar.gz > news.3B.tar.gz
curl -L https://storage.googleapis.com/google-code-archive-downloads/v2/code.google.com/isri-ocr-evaluation-tools/spn.3B.tar.gz > spn.3B.tar.gz
```
### Step 2: extract the files.
It doesn't really matter where
in your filesystem you put them, but they must go under a common
root so you have directories doe3.3B, bus.3B, mag.3B and news.3B in, for example,
~/ISRI-OCRtk.
```
mkdir -p ~/ISRI-OCRtk
cd ~/ISRI-OCRtk
tar xzvf ~/isri-downloads/bus.3B.tar.gz
tar xzvf ~/isri-downloads/doe3.3B.tar.gz
tar xzvf ~/isri-downloads/mag.3B.tar.gz
tar xzvf ~/isri-downloads/news.3B.tar.gz
tar xzvf ~/isri-downloads/spn.3B.tar.gz
mkdir -p stopwords
cd stopwords
wget -O spa.stopwords.txt https://raw.githubusercontent.com/stopwords-iso/stopwords-es/master/stopwords-es.txt
```
Edit ~/ISRI-OCRtk/stopwords/spa.stopwords.txt
wordacc uses a space delimited stopwords file, not line delimited.
s/\n/ /g
Edit ~/ISRI-OCRtk/spn.3B/pages
Delete the line containing the following imagename as it [crashes tesseract](https://github.com/tesseract-ocr/tesseract/issues/1647#issuecomment-395954717).
7733_005.3B 3
### Step 3: Download the modified ISRI toolkit, make and install the tools :
These will be installed in /usr/local/bin.
```
git clone https://github.com/Shreeshrii/ocr-evaluation-tools.git
cd ~/ocr-evaluation-tools
sudo make install
```
### Step 4: cd back to your main tesseract-ocr dir and Build tesseract.
### Step 5: run unlvtests/runalltests.sh with the root ISRI data dir, testname, tessdata-dir:
```
unlvtests/runalltests.sh ~/ISRI-OCRtk 4_fast_eng ../tessdata_fast
```
and go to the gym, have lunch etc. It takes a while to run.
### Step 6: There should be a RELEASE.summary file
*unlvtests/reports/4-beta_fast.summary* that contains the final summarized accuracy
report and comparison with the 1995 results.
### Step 7: run the test for Spanish.
```
unlvtests/runalltests_spa.sh ~/ISRI-OCRtk 4_fast_spa ../tessdata_fast
```
#### Notes from Nick White regarding wordacc
If you just want to remove all lines which have 100% recognition,
you can add a 'awk' command like this:
ocrevalutf8 wordacc ground.txt ocr.txt | awk '$3 != 100 {print $0}' > results.txt
or if you've already got a results file you want to change, you can do this:
awk '$3 != 100 {print $0}' results.txt > newresults.txt
If you only want the last sections where things are broken down by
word, you can add a sed command, like this:
ocrevalutf8 wordacc ground.txt ocr.txt | sed '/^ Count Missed %Right $/,$!d' | awk '$3 != 100 {print $0}' > results.txt

View File

@ -1,68 +0,0 @@
#!/bin/bash
# File: counttestset.sh
# Description: Script to count the errors on a single UNLV set.
# Author: Ray Smith
# Created: Wed Jun 13 11:58:01 PDT 2007
#
# (C) Copyright 2007, Google Inc.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
if [ $# -ne 2 ]
then
  echo "Usage:$0 pagesfile langcode"
  exit 1
fi
if [ ! -d src/api ]
then
  echo "Run $0 from the tesseract-ocr root directory!"
  exit 1
fi
pages=$1
langcode=$2
# The pages file lives inside the image directory; derive the set name
# from the directory's basename.
imdir=${pages%/pages}
setname=${imdir##*/}
resdir=unlvtests/results/$setname
mkdir -p unlvtests/reports
echo "Counting on set $setname in directory $imdir to $resdir"
# Collect the per-page result files in arrays so filenames survive intact
# (the old string-concatenation approach depended on unquoted expansion).
accfiles=()
wafiles=()
while read -r page dir
do
  # A pages file may list "page subdir" pairs or just bare page names.
  if [ "$dir" ]
  then
    srcdir="$imdir/$dir"
  else
    srcdir="$imdir"
  fi
  #echo "$srcdir/$page.tif"
  # Convert groundtruth and recognized text to UTF-8 to correctly treat accented letters.
  iconv -f ISO8859-1 -t UTF-8 "$srcdir/$page.txt" >"$srcdir/$page.text"
  iconv -f ISO8859-1 -t UTF-8 "$resdir/$page.unlv" >"$resdir/$page.text"
  # Count character errors.
  ocrevalutf8 accuracy "$srcdir/$page.text" "$resdir/$page.text" > "$resdir/$page.acc"
  accfiles+=("$resdir/$page.acc")
  # Count word errors.
  #langcode should be either eng or spa
  if [ "$langcode" = "eng" ]
  then
    ocrevalutf8 wordacc "$srcdir/$page.text" "$resdir/$page.text" > "$resdir/$page.wa"
  else
    # Spanish runs exclude stopwords from the word-accuracy count.
    cp ~/ISRI-OCRtk/stopwords/spa.stopwords.txt "$resdir/spa.stopwords"
    ocrevalutf8 wordacc -S"$resdir/spa.stopwords" "$srcdir/$page.text" "$resdir/$page.text" > "$resdir/$page.wa"
  fi
  wafiles+=("$resdir/$page.wa")
done <"$pages"
# Summarize character and word accuracy over the whole set.
accsum "${accfiles[@]}" >"unlvtests/results/$setname.characc"
wordaccsum "${wafiles[@]}" >"unlvtests/results/$setname.wordacc"

View File

@ -1,53 +0,0 @@
#!/bin/bash
# Renames the cryptically-named downloaded UNLV test sets (3, B, M, N, ...)
# to meaningful names (doe3.3B, bus.3B, mag.3B, news.3B, ...), builds each
# set's "pages" file, and gives the image/zone/truth files proper
# .tif/.uzn/.txt extensions.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
if [ $# -ne 1 ]
then
  echo "Usage:$0 scantype"
  echo "UNLV data comes in several scan types:"
  echo "3B=300 dpi binary"
  echo "3A=adaptive thresholded 300 dpi"
  echo "3G=300 dpi grey"
  echo "4B=400dpi binary"
  echo "2B=200dpi binary"
  echo "For now we only use 3B"
  exit 1
fi
ext=$1
#There are several test sets without meaningful names, so rename
#them with something a bit more meaningful.
#Each s is oldname/newname
for s in 3/doe3 B/bus M/mag N/news L/legal R/rep S/spn Z/zset
do
  old=${s%/*}
  #if this set was downloaded then process it.
  if [ -r "$old/PAGES" ]
  then
    new=${s#*/}.$ext
    mkdir -p "$new"
    echo "Set $old -> $new"
    #The pages file had - instead of _ so fix it and add the extension.
    for page in $(cat "$old/PAGES")
    do
      echo "${page%-*}_${page#*-}.$ext"
    done >"$new/pages"
    for f in $(cat "$new/pages")
    do
      #Put a tif extension on the tif files.
      cp "$old/${old}_B/$f" "$new/$f.tif"
      #Put a uzn extension on the zone files.
      cp "$old/${old}_B/${f}Z" "$new/$f.uzn"
      #Cat all the truth files together and put into a single txt file.
      cat "$old/${old}_GT/${f%.$ext}".Z* >"$new/$f.txt"
    done
  fi
done

View File

@ -1 +0,0 @@
1995 bus.3B 5959 98.14% 0.00% 1631 96.83% 0.00% 1293 95.73% 0.00%

View File

@ -1 +0,0 @@
1995 doe3.3B 36349 97.52% 0.00% 7826 96.34% 0.00% 7042 94.87% 0.00%

View File

@ -1 +0,0 @@
1995 mag.3B 15043 97.74% 0.00% 4566 96.01% 0.00% 3379 94.99% 0.00%

View File

@ -1 +0,0 @@
1995 news.3B 6432 98.69% 0.00% 1946 97.68% 0.00% 1502 96.94% 0.00%

View File

@ -1,9 +0,0 @@
1995 bus.3B 5959 98.14% 0.00% 1631 96.83% 0.00% 1293 95.73% 0.00%
1995 doe3.3B 36349 97.52% 0.00% 7826 96.34% 0.00% 7042 94.87% 0.00%
1995 mag.3B 15043 97.74% 0.00% 4566 96.01% 0.00% 3379 94.99% 0.00%
1995 news.3B 6432 98.69% 0.00% 1946 97.68% 0.00% 1502 96.94% 0.00%
2.03 bus.3B 6422 97.99% 7.77% 1750 96.60% 7.30% 1361 95.51 5.26%
2.03 doe3.3B 29520 97.98% -18.79% 7966 96.27% 1.79% 6764 95.07 -3.95%
2.03 mag.3B 14568 97.81% -3.16% 4288 96.25% -6.09% 3054 95.47 -9.62%
2.03 news.3B 7655 98.44% 19.01% 1730 97.94% -11.10% 1208 97.54 -19.57%
2.03 Total 58165 - -8.81% 15734 - -1.47% 12387 - -6.27%

View File

@ -1,9 +0,0 @@
1995 bus.3B 5959 98.14% 0.00% 1631 96.83% 0.00% 1293 95.73% 0.00%
1995 doe3.3B 36349 97.52% 0.00% 7826 96.34% 0.00% 7042 94.87% 0.00%
1995 mag.3B 15043 97.74% 0.00% 4566 96.01% 0.00% 3379 94.99% 0.00%
1995 news.3B 6432 98.69% 0.00% 1946 97.68% 0.00% 1502 96.94% 0.00%
2.04 bus.3B 6422 97.99% 7.77% 1750 96.60% 7.30% 1361 95.51 5.26%
2.04 doe3.3B 29514 97.98% -18.80% 7963 96.27% 1.75% 6762 95.07 -3.98%
2.04 mag.3B 14568 97.81% -3.16% 4289 96.25% -6.07% 3053 95.47 -9.65%
2.04 news.3B 7655 98.44% 19.01% 1730 97.94% -11.10% 1208 97.54 -19.57%
2.04 Total 58159 - -8.82% 15732 - -1.48% 12384 - -6.30%

View File

@ -1,2 +0,0 @@
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWordErrors Accuracy TimeTaken
4_best_int_spa spn.3B 2846 99.18% 937 98.39% 739 97.54 6478.02s

View File

@ -1,2 +0,0 @@
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWordErrors Accuracy TimeTaken
4_best_spa spn.3B 2823 99.19% 924 98.41% 729 97.57 7233.76s

View File

@ -1,9 +0,0 @@
1995 bus.3B 5959 98.14% 0.00% 1631 96.83% 0.00% 1293 95.73% 0.00%
1995 doe3.3B 36349 97.52% 0.00% 7826 96.34% 0.00% 7042 94.87% 0.00%
1995 mag.3B 15043 97.74% 0.00% 4566 96.01% 0.00% 3379 94.99% 0.00%
1995 news.3B 6432 98.69% 0.00% 1946 97.68% 0.00% 1502 96.94% 0.00%
4_fast_eng bus.3B 6124 98.11% 2.77% 1138 97.88% -30.23% 963 97.05 -25.52% 3935.26s
4_fast_eng doe3.3B 30029 97.96% -17.39% 13781 94.45% 76.09% 13178 92.38 87.13% 18847.36s
4_fast_eng mag.3B 10934 98.37% -27.32% 3343 97.15% -26.78% 2813 96.06 -16.75% 6867.14s
4_fast_eng news.3B 5734 98.84% -10.85% 1322 98.45% -32.07% 1040 97.94 -30.76% 5527.38s
4_fast_eng Total 52821 - -17.19% 19584 - 22.64% 17994 - 36.15%

View File

@ -1,2 +0,0 @@
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWordErrors Accuracy TimeTaken
4_fast_spa spn.3B 2841 99.18% 879 98.49% 742 97.53 3838.82s

View File

@ -1,135 +0,0 @@
#!/bin/bash
# File: runalltests.sh
# Description: Script to run a set of UNLV test sets for English.
# Author: Ray Smith
# Created: Thu Jun 14 08:21:01 PDT 2007
#
# (C) Copyright 2007, Google Inc.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Usage: runalltests.sh unlv-data-dir version-id tessdata-dir
# Runs tesseract over the English UNLV test sets, scores the output via
# counttestset.sh, and writes per-set and grand-total summary reports that
# compare the new error counts against the archived 1995 baseline numbers.
if [ $# -ne 3 ]
then
  echo "Usage:$0 unlv-data-dir version-id tessdata-dir"
  exit 1
fi
if [ ! -d src/api ]
then
  echo "Run $0 from the tesseract-ocr root directory!"
  exit 1
fi
if [ ! -r src/api/tesseract ] && [ ! -r tesseract.exe ]
then
  echo "Please build tesseract before running $0"
  exit 1
fi
tessdata=$3
# deltapc new old: prints the percent change from old to new.
# The values are passed with awk -v instead of being spliced into the awk
# program text, so a non-numeric argument cannot corrupt the awk syntax.
deltapc() {
  awk -v nv="$1" -v ov="$2" 'BEGIN {
    printf("%.2f", 100.0 * (nv - ov) / ov);
  }'
}
# timesum file: totals the per-page CPU times (2nd column) of the given file.
timesum() {
  awk ' BEGIN {
    total = 0.0;
  }
  {
    total += $2;
  }
  END {
    printf("%.2f\n", total);
  }' "$1"
}
imdir="$1"
vid="$2"
# Directory holding this script; runtestset.sh and counttestset.sh sit beside it.
bindir=${0%/*}
if [ "$bindir" = "$0" ]
then
  bindir="./"
fi
rdir=unlvtests/reports
testsets="bus.3B doe3.3B mag.3B news.3B"
#testsets="bus.3B"
totalerrs=0
totalwerrs=0
totalnswerrs=0
totalolderrs=0
totaloldwerrs=0
totaloldnswerrs=0
for set in $testsets
do
  if [ -r "$imdir/$set/pages" ]
  then
    # Run tesseract on all the pages.
    "$bindir"/runtestset.sh "$imdir/$set/pages" "$tessdata" "eng"
    # Count the errors on all the pages.
    "$bindir"/counttestset.sh "$imdir/$set/pages" "eng"
    # Get the old character word and nonstop word errors.
    olderrs=$(cut -f3 "unlvtests/reports/1995.$set.sum")
    oldwerrs=$(cut -f6 "unlvtests/reports/1995.$set.sum")
    oldnswerrs=$(cut -f9 "unlvtests/reports/1995.$set.sum")
    # Get the new character word and nonstop word errors and accuracy.
    # The .characc/.wordacc files are fixed-column ISRI tool reports, hence
    # the cut -cN-M column slicing.
    cherrs=$(head -4 "unlvtests/results/$set.characc" |tail -1 |cut -c1-9 |
      tr -d '[:blank:]')
    chacc=$(head -5 "unlvtests/results/$set.characc" |tail -1 |cut -c1-9 |
      tr -d '[:blank:]')
    wderrs=$(head -4 "unlvtests/results/$set.wordacc" |tail -1 |cut -c1-9 |
      tr -d '[:blank:]')
    wdacc=$(head -5 "unlvtests/results/$set.wordacc" |tail -1 |cut -c1-9 |
      tr -d '[:blank:]')
    nswderrs=$(grep Total "unlvtests/results/$set.wordacc" |head -2 |tail -1 |
      cut -c10-17 |tr -d '[:blank:]')
    nswdacc=$(grep Total "unlvtests/results/$set.wordacc" |head -2 |tail -1 |
      cut -c19-26 |tr -d '[:blank:]')
    # Compute the percent change.
    chdelta=$(deltapc "$cherrs" "$olderrs")
    wdelta=$(deltapc "$wderrs" "$oldwerrs")
    nswdelta=$(deltapc "$nswderrs" "$oldnswerrs")
    sumfile=$rdir/$vid.$set.sum
    if [ -r "unlvtests/results/$set.times" ]
    then
      total_time=$(timesum "unlvtests/results/$set.times")
      if [ -r "unlvtests/results/prev/$set.times" ]
      then
        # Per-page time difference against the previous run, slowest first.
        paste "unlvtests/results/prev/$set.times" "unlvtests/results/$set.times" |
          awk '{ printf("%s %.2f\n", $1, $4-$2); }' |sort -k2n >"unlvtests/results/$set.timedelta"
      fi
    else
      total_time='0.0'
    fi
    echo "$vid $set $cherrs $chacc $chdelta% $wderrs $wdacc\
 $wdelta% $nswderrs $nswdacc $nswdelta% ${total_time}s" >"$sumfile"
    # Sum totals over all the testsets; $(( )) replaces the deprecated 'let'.
    totalerrs=$((totalerrs + cherrs))
    totalwerrs=$((totalwerrs + wderrs))
    totalnswerrs=$((totalnswerrs + nswderrs))
    totalolderrs=$((totalolderrs + olderrs))
    totaloldwerrs=$((totaloldwerrs + oldwerrs))
    totaloldnswerrs=$((totaloldnswerrs + oldnswerrs))
  fi
done
# Compute grand total percent change.
chdelta=$(deltapc "$totalerrs" "$totalolderrs")
wdelta=$(deltapc "$totalwerrs" "$totaloldwerrs")
nswdelta=$(deltapc "$totalnswerrs" "$totaloldnswerrs")
tfile=$rdir/$vid.total.sum
echo "$vid Total $totalerrs - $chdelta% $totalwerrs\
 - $wdelta% $totalnswerrs - $nswdelta%" >"$tfile"
# The 1995.* glob is intentionally unquoted so it expands to the baselines.
cat $rdir/1995.*.sum "$rdir/$vid".*.sum >"$rdir/$vid".summary
mv "$rdir/$vid".*.sum unlvtests/results/
cat "$rdir/$vid".summary

View File

@ -1,109 +0,0 @@
#!/bin/bash
##############################################################################
# File: runalltests_spa.sh
# Description: Script to run a set of UNLV test sets for Spanish.
# based on runalltests.sh by Ray Smith
# Author: Shree Devi Kumar
# Created: June 09, 2018
#
# (C) Copyright 2007, Google Inc.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
##############################################################################
# Usage: runalltests_spa.sh unlv-data-dir version-id tessdata-dir
# Runs tesseract over the Spanish UNLV test set (spn.3B), scores the output
# via counttestset.sh, and writes a summary report. Unlike runalltests.sh
# there is no 1995 baseline to diff against, so no percent-change columns.
if [ $# -ne 3 ]
then
  echo "Usage:$0 unlv-data-dir version-id tessdata-dir"
  exit 1
fi
if [ ! -d src/api ]
then
  echo "Run $0 from the tesseract-ocr root directory!"
  exit 1
fi
if [ ! -r src/api/tesseract ] && [ ! -r tesseract.exe ]
then
  echo "Please build tesseract before running $0"
  exit 1
fi
tessdata=$3
# NOTE: the original assigned lang=$4, but the $# -ne 3 guard above means a
# 4th argument can never be supplied, and the language is hard-coded to
# "spa" below — the dead assignment has been removed.
# timesum file: totals the per-page CPU times (2nd column) of the given file.
timesum() {
  awk ' BEGIN {
    total = 0.0;
  }
  {
    total += $2;
  }
  END {
    printf("%.2f\n", total);
  }' "$1"
}
imdir="$1"
vid="$2"
# Directory holding this script; runtestset.sh and counttestset.sh sit beside it.
bindir=${0%/*}
if [ "$bindir" = "$0" ]
then
  bindir="./"
fi
rdir=unlvtests/reports
testsets="spn.3B"
totalerrs=0
totalwerrs=0
totalnswerrs=0
for set in $testsets
do
  if [ -r "$imdir/$set/pages" ]
  then
    # Run tesseract on all the pages.
    "$bindir"/runtestset.sh "$imdir/$set/pages" "$tessdata" "spa"
    # Count the errors on all the pages.
    "$bindir"/counttestset.sh "$imdir/$set/pages" "spa"
    # Get the new character word and nonstop word errors and accuracy.
    # The .characc/.wordacc files are fixed-column ISRI tool reports, hence
    # the cut -cN-M column slicing.
    cherrs=$(head -4 "unlvtests/results/$set.characc" |tail -1 |cut -c1-9 |
      tr -d '[:blank:]')
    chacc=$(head -5 "unlvtests/results/$set.characc" |tail -1 |cut -c1-9 |
      tr -d '[:blank:]')
    wderrs=$(head -4 "unlvtests/results/$set.wordacc" |tail -1 |cut -c1-9 |
      tr -d '[:blank:]')
    wdacc=$(head -5 "unlvtests/results/$set.wordacc" |tail -1 |cut -c1-9 |
      tr -d '[:blank:]')
    nswderrs=$(grep Total "unlvtests/results/$set.wordacc" |head -2 |tail -1 |
      cut -c10-17 |tr -d '[:blank:]')
    nswdacc=$(grep Total "unlvtests/results/$set.wordacc" |head -2 |tail -1 |
      cut -c19-26 |tr -d '[:blank:]')
    sumfile=$rdir/$vid.$set.sum
    if [ -r "unlvtests/results/$set.times" ]
    then
      total_time=$(timesum "unlvtests/results/$set.times")
      if [ -r "unlvtests/results/prev/$set.times" ]
      then
        # Per-page time difference against the previous run, slowest first.
        paste "unlvtests/results/prev/$set.times" "unlvtests/results/$set.times" |
          awk '{ printf("%s %.2f\n", $1, $4-$2); }' |sort -k2n >"unlvtests/results/$set.timedelta"
      fi
    else
      total_time='0.0'
    fi
    echo "RELEASE TestSet CharErrors Accuracy WordErrors Accuracy\
 NonStopWordErrors Accuracy TimeTaken">"$sumfile"
    echo "$vid $set $cherrs $chacc $wderrs $wdacc\
 $nswderrs $nswdacc ${total_time}s" >>"$sumfile"
  fi
done
cat "$rdir/$vid".*.sum >"$rdir/$vid".summary
mv "$rdir/$vid".*.sum unlvtests/results/
cat "$rdir/$vid".summary

View File

@ -1,80 +0,0 @@
#!/bin/bash
# File: runtestset.sh
# Description: Script to run tesseract on a single UNLV set.
# Author: Ray Smith
# Created: Wed Jun 13 10:13:01 PDT 2007
#
# (C) Copyright 2007, Google Inc.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Usage: runtestset.sh pagesfile tessdata-dir lang [-zoning]
# Runs tesseract on every page listed in pagesfile, writing recognition
# output and per-page CPU times under unlvtests/results/.
if [ $# -ne 3 ] && [ $# -ne 4 ]
then
  echo "Usage:$0 pagesfile tessdata-dir lang [-zoning]"
  exit 1
fi
if [ ! -d src/api ]
then
  echo "Run $0 from the tesseract-ocr root directory!"
  exit 1
fi
# Build the tesseract command as an array so the GNU time wrapper and its
# arguments survive expansion as separate words (no string re-splitting).
if [ ! -r src/api/tesseract ]
then
  if [ ! -r tesseract.exe ]
  then
    echo "Please build tesseract before running $0"
    exit 1
  else
    tess=(./tesseract.exe)
  fi
else
  # GNU time: %U = user CPU seconds, written to times.txt per page.
  tess=(time -f %U -o times.txt src/api/tesseract)
  #tess=(time -f %U -o times.txt tesseract)
fi
tessdata=$2
lang=$3
pages=$1
imdir=${pages%/pages}   # strip trailing /pages -> image directory
setname=${imdir##*/}    # last path component is the test-set name
if [ $# -eq 4 ] && [ "$4" = "-zoning" ]
then
  config=unlv.auto
  resdir=unlvtests/results/zoning.$setname
else
  config=unlv
  resdir=unlvtests/results/$setname
fi
echo -e "Testing on set $setname in directory $imdir to $resdir\n"
mkdir -p "$resdir"
rm -f "unlvtests/results/$setname.times"
while read -r page dir
do
  # A pages file may be a list of files with subdirs or maybe just
  # a plain list of files so accommodate both.
  if [ "$dir" ]
  then
    srcdir="$imdir/$dir"
  else
    srcdir="$imdir"
  fi
  # echo "$srcdir/$page.tif"
  "${tess[@]}" "$srcdir/$page.tif" "$resdir/$page" --tessdata-dir "$tessdata" --oem 1 -l "$lang" --psm 6 "$config" 2>&1 |grep -v "OCR Engine" |grep -v "Page 1"
  if [ -r times.txt ]
  then
    read -r t <times.txt
    echo "$page $t" >>"unlvtests/results/$setname.times"
    # \033M (reverse line feed) keeps the progress line updating in place.
    echo -e "\033M$page $t"
    # GNU time records this message when the child is interrupted (Ctrl-C);
    # stop cleanly instead of ploughing through the remaining pages.
    if [ "$t" = "Command terminated by signal 2" ]
    then
      exit 0
    fi
  fi
done <"$pages"