Add langtests framework with frk example

This commit is contained in:
Shree Devi Kumar 2018-08-30 14:28:34 +00:00
parent a849860c6e
commit 92922b421c
14 changed files with 367 additions and 1 deletions

View File

@ -24,7 +24,7 @@ SUBDIRS += src/ccmain src/api . tessdata doc unittest
EXTRA_DIST = README.md\
aclocal.m4 config configure.ac autogen.sh contrib \
tesseract.pc.in $(TRAINING_SUBDIR) java doc unlvtests
tesseract.pc.in $(TRAINING_SUBDIR) java doc langtests unlvtests
DIST_SUBDIRS = $(SUBDIRS) $(TRAINING_SUBDIR)

View File

@ -466,6 +466,7 @@ fi
# Output files
AC_CONFIG_FILES([Makefile tesseract.pc])
AC_CONFIG_FILES([langtests/Makefile])
AC_CONFIG_FILES([src/api/Makefile])
AC_CONFIG_FILES([src/api/tess_version.h])
AC_CONFIG_FILES([src/arch/Makefile])

2
langtests/.gitignore vendored Normal file
View File

@ -0,0 +1,2 @@
#
results/*

8
langtests/Makefile.am Normal file
View File

@ -0,0 +1,8 @@
# Files from this directory that are added to the distribution:
# the README, the helper scripts and the reference reports.
EXTRA_DIST = README.md
EXTRA_DIST += frk_setup.sh
EXTRA_DIST += frk_test.sh
EXTRA_DIST += counttestset.sh
EXTRA_DIST += runlangtests.sh
EXTRA_DIST += runtestset.sh
# NOTE(review): this entry is a shell wildcard, not a file name; automake
# passes it through unexpanded -- confirm that `make dist` picks the reports up.
EXTRA_DIST += reports/*

98
langtests/README.md Normal file
View File

@ -0,0 +1,98 @@
# How to run Language tests.
The scripts in this directory make it possible to test Accuracy of Tesseract
for different languages.
### Step 1: If not already installed, download the modified ISRI toolkit,
make and install the tools in /usr/local/bin.
```
git clone https://github.com/Shreeshrii/ocr-evaluation-tools.git
cd ocr-evaluation-tools
sudo make install
```
### Step 2: If not already installed, build tesseract.
## Testing for Fraktur - frk and script/Fraktur
### Step 3: download the images and groundtruth
```
mkdir -p ~/lang-downloads
cd ~/lang-downloads
wget -O frk-jbarth-ubhd.zip http://digi.ub.uni-heidelberg.de/diglitData/v/abbyy11r8-vs-tesseract4.zip
wget -O frk-stweil-gt.zip https://digi.bib.uni-mannheim.de/~stweil/fraktur-gt.zip
```
### Step 4: extract the files.
It doesn't really matter where in your filesystem you put them,
but they must go under a common root, for example, ~/lang-files
```
mkdir -p ~/lang-files
cd ~/lang-files
unzip ~/lang-downloads/frk-jbarth-ubhd.zip -d frk
unzip ~/lang-downloads/frk-stweil-gt.zip -d frk
mkdir -p ./frk-ligatures
cp ./frk/abbyy-vs-tesseract/*.tif ./frk-ligatures/
cp ./frk/gt/*.txt ./frk-ligatures/
cd ./frk-ligatures/
ls -1 *.tif >pages
sed -i -e 's/\.tif$//' pages
cat pages
```
```
mkdir -p ~/lang-stopwords
cd ~/lang-stopwords
wget -O frk.stopwords.txt https://raw.githubusercontent.com/stopwords-iso/stopwords-de/master/stopwords-de.txt
```
Edit ~/lang-stopwords/frk.stopwords.txt, because
wordacc uses a space-delimited stopwords file, not a line-delimited one.
```
sed -i -e ':a' -e 'N' -e '$!ba' -e 's/\n/ /g' frk.stopwords.txt
cat frk.stopwords.txt
```
### Step 5: run langtests/runlangtests.sh with the root ISRI data dir, testname, tessdata-dir, language code:
```
cd ~/tesseract
langtests/runlangtests.sh ~/lang-files 4_fast_Fraktur ../tessdata_fast/script Fraktur
langtests/runlangtests.sh ~/lang-files 4_fast_frk ../tessdata_fast frk
langtests/runlangtests.sh ~/lang-files 4_best_int_frk ../tessdata frk
langtests/runlangtests.sh ~/lang-files 4_best_frk ../tessdata_best frk
langtests/runlangtests.sh ~/lang-files 4_shreetest_frk-Fraktur /home/ubuntu/tessdata_frk/frk-finetune-impact frk
langtests/runlangtests.sh ~/lang-files 4_shreetest_frk-frk /home/ubuntu/tessdata_frk/frk-finetune-frk frk
```
and go to the gym, have lunch etc. It takes a while to run.
### Step 6: There should be a RELEASE.summary file
(for example *langtests/reports/4_fast_frk.summary*) that contains the final summarized accuracy.
#### Notes from Nick White regarding wordacc
If you just want to remove all lines which have 100% recognition,
you can add a 'awk' command like this:
ocrevalutf8 wordacc ground.txt ocr.txt | awk '$3 != 100 {print $0}' > results.txt
or if you've already got a results file you want to change, you can do this:
awk '$3 != 100 {print $0}' results.txt > newresults.txt
If you only want the last sections where things are broken down by
word, you can add a sed command, like this:
ocrevalutf8 wordacc ground.txt ocr.txt | sed '/^ Count Missed %Right $/,$!d' | awk '$3 != 100 {print $0}' > results.txt

52
langtests/counttestset.sh Executable file
View File

@ -0,0 +1,52 @@
#!/bin/bash
# File: counttestset.sh
# Description: Script to count the errors on a single UNLV set.
# Author: Ray Smith
# Created: Wed Jun 13 11:58:01 PDT 2007
#
# (C) Copyright 2007, Google Inc.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Usage: counttestset.sh pagesfile langcode
#
# For every page listed in pagesfile, compares the ground truth text next to
# the image with the OCR output in langtests/results/<set>/ and writes a
# character report (.acc) and a word report (.wa) per page. The per-page
# reports are then combined into langtests/results/<set>.characc and
# langtests/results/<set>.wordacc.
if [ $# -ne 2 ]
then
  echo "Usage:$0 pagesfile langcode"
  exit 1
fi

pages=$1
langcode=$2
# The set directory is the pages path without its trailing "/pages";
# the set name is the last component of that directory.
imdir=${pages%/pages}
setname=${imdir##*/}
resdir=langtests/results/$setname
mkdir -p langtests/reports
echo "Counting on set $setname in directory $imdir to $resdir"

# Per-page report files are collected in arrays so that paths containing
# whitespace reach the summarizers as single arguments.
accfiles=()
wafiles=()

# Each line of the pages file is "page [subdir]". -r keeps backslashes
# literal; the extra test keeps a last line that lacks a trailing newline.
while read -r page dir || [ -n "$page" ]
do
  # Ignore blank lines instead of scoring a page with an empty name.
  [ -n "$page" ] || continue
  if [ "$dir" ]
  then
    srcdir="$imdir/$dir"
  else
    srcdir="$imdir"
  fi
  echo "$srcdir/$page.tif"
  # Count character errors.
  ocrevalutf8 accuracy "$srcdir/$page.txt" "$resdir/$page.txt" > "$resdir/$page.acc"
  accfiles+=("$resdir/$page.acc")
  # Count word errors.
  ocrevalutf8 wordacc -S"$resdir/$langcode.stopwords" "$srcdir/$page.txt" "$resdir/$page.txt" > "$resdir/$page.wa"
  wafiles+=("$resdir/$page.wa")
done <"$pages"

# Combine the per-page reports into one report per test set.
accsum "${accfiles[@]}" >"langtests/results/$setname.characc"
wordaccsum "${wafiles[@]}" >"langtests/results/$setname.wordacc"

24
langtests/frk_setup.sh Normal file
View File

@ -0,0 +1,24 @@
#!/bin/bash
#
# Downloads the Fraktur test images and ground truth, assembles the
# "frk-ligatures" test set under ~/lang-files and fetches a German stopword
# list into ~/lang-stopwords.
#
# Every cd is checked: continuing after a failed cd would download and copy
# files into whatever directory the script happened to be in.

# 1. Download the archives.
mkdir -p ~/lang-downloads
cd ~/lang-downloads || exit 1
wget -O frk-jbarth-ubhd.zip http://digi.ub.uni-heidelberg.de/diglitData/v/abbyy11r8-vs-tesseract4.zip
wget -O frk-stweil-gt.zip https://digi.bib.uni-mannheim.de/~stweil/fraktur-gt.zip

# 2. Unpack them and gather images and ground truth in one directory.
mkdir -p ~/lang-files
cd ~/lang-files || exit 1
unzip ~/lang-downloads/frk-jbarth-ubhd.zip -d frk
unzip ~/lang-downloads/frk-stweil-gt.zip -d frk
mkdir -p ./frk-ligatures
cp ./frk/abbyy-vs-tesseract/*.tif ./frk-ligatures/
cp ./frk/gt/*.txt ./frk-ligatures/

# 3. Write the pages file: one image name per line without the ".tif"
#    suffix. Only the suffix is stripped, so a name that contains "tif"
#    elsewhere is left intact.
cd ./frk-ligatures/ || exit 1
for image in *.tif
do
  # An unmatched glob stays literal; skip it so pages ends up empty.
  [ -e "$image" ] || continue
  printf '%s\n' "${image%.tif}"
done >pages

# 4. Fetch the stopword list (one word per line as downloaded).
mkdir -p ~/lang-stopwords
cd ~/lang-stopwords || exit 1
wget -O frk.stopwords.txt https://raw.githubusercontent.com/stopwords-iso/stopwords-de/master/stopwords-de.txt
echo "Edit ~/lang-stopwords/frk.stopwords.txt as wordacc uses a space delimited stopwords file, not line delimited."

13
langtests/frk_test.sh Normal file
View File

@ -0,0 +1,13 @@
#!/bin/bash
#
# Run langtests/runlangtests.sh with the root data dir, test name,
# tessdata dir and language code, once per model set.
# The relative paths below only resolve from the tesseract source tree,
# so stop if it cannot be entered.
cd ~/tesseract || exit 1
langtests/runlangtests.sh ~/lang-files 4_fast_Fraktur ../tessdata_fast/script Fraktur
langtests/runlangtests.sh ~/lang-files 4_fast_frk ../tessdata_fast frk
langtests/runlangtests.sh ~/lang-files 4_best_int_frk ../tessdata frk
langtests/runlangtests.sh ~/lang-files 4_best_frk ../tessdata_best frk
### It takes a while to run.

View File

@ -0,0 +1,2 @@
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken
4_best_frk frk-ligatures 178 94.73% 100 81.31% 74 75.17 94.29s

View File

@ -0,0 +1,2 @@
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken
4_best_int_frk frk-ligatures 244 92.78% 109 79.63% 80 73.15 89.80s

View File

@ -0,0 +1,2 @@
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken
4_fast_Fraktur frk-ligatures 265 92.16% 116 78.32% 82 72.48 91.29s

View File

@ -0,0 +1,2 @@
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken
4_fast_frk frk-ligatures 244 92.78% 109 79.63% 80 73.15 89.98s

100
langtests/runlangtests.sh Executable file
View File

@ -0,0 +1,100 @@
#!/bin/bash
##############################################################################
# File: runlangtests.sh
# Description: Script to run the test sets for a language (currently Fraktur).
# based on runalltests.sh by Ray Smith
# Author: Shree Devi Kumar
# Created: June 09, 2018
#
# (C) Copyright 2007, Google Inc.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
##############################################################################
# Exactly four arguments are required; print the usage line and stop otherwise.
if (( $# != 4 )); then
  echo "Usage:$0 unlv-data-dir version-id tessdata-dir langcode"
  exit 1
fi

# Fourth argument: language code; third argument: tessdata directory.
lang=$4
tessdata=$3
# timesum FILE
# Adds up the second whitespace-separated field of every line in FILE and
# prints the sum with two decimal places (0.00 for an empty file).
timesum() {
  awk '{ sum += $2 } END { printf("%.2f\n", sum) }' "$1"
}
# Main driver: runs tesseract on every test set defined for $lang, scores the
# output and writes one summary line per set to langtests/reports/<vid>.summary.
imdir="$1"
vid="$2"
# Directory part of the path this script was invoked with; the helper
# scripts are looked up there.
bindir=${0%/*}
if [ "$bindir" = "$0" ]
then
  bindir="./"
fi
rdir=langtests/reports

# Test sets available for each language code.
testsets=()
if [ "$lang" = "frk" ] || [ "$lang" = "Fraktur" ]
then
  testsets=("frk-ligatures")
fi
# Fail early with a clear message rather than falling through to a
# "no such file" error from the summary step.
if [ "${#testsets[@]}" -eq 0 ]
then
  echo "$0: no test sets are defined for language code '$lang'" >&2
  exit 1
fi

mkdir -p "$rdir"
sumfiles=()
for testset in "${testsets[@]}"
do
  resdir=langtests/results/$testset
  mkdir -p "$resdir"
  # Both supported language codes are scored with the frk stopword list.
  cp ~/lang-stopwords/frk.stopwords.txt "$resdir/$lang.stopwords"
  if [ -r "$imdir/$testset/pages" ]
  then
    # Run tesseract on all the pages.
    "$bindir/runtestset.sh" "$imdir/$testset/pages" "$tessdata" "$lang"
    # Count the errors on all the pages.
    "$bindir/counttestset.sh" "$imdir/$testset/pages" "$lang"
    # Get the new character, word and nonstop word errors and accuracy.
    # The values are cut from fixed columns of the 4th/5th report lines and
    # of the second "Total" line of the word report.
    cherrs=$(head -4 "langtests/results/$testset.characc" |tail -1 |cut -c1-9 |
      tr -d '[:blank:]')
    chacc=$(head -5 "langtests/results/$testset.characc" |tail -1 |cut -c1-9 |
      tr -d '[:blank:]')
    wderrs=$(head -4 "langtests/results/$testset.wordacc" |tail -1 |cut -c1-9 |
      tr -d '[:blank:]')
    wdacc=$(head -5 "langtests/results/$testset.wordacc" |tail -1 |cut -c1-9 |
      tr -d '[:blank:]')
    nswderrs=$(grep Total "langtests/results/$testset.wordacc" |head -2 |tail -1 |
      cut -c10-17 |tr -d '[:blank:]')
    nswdacc=$(grep Total "langtests/results/$testset.wordacc" |head -2 |tail -1 |
      cut -c19-26 |tr -d '[:blank:]')
    sumfile=$rdir/$vid.$testset.sum
    if [ -r "langtests/results/$testset.times" ]
    then
      total_time=$(timesum "langtests/results/$testset.times")
    else
      total_time='0.0'
    fi
    # Header and data line are each written as one string so that adjacent
    # columns can never be fused by a line continuation.
    {
      echo "RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken"
      echo "$vid $testset $cherrs $chacc $wderrs $wdacc $nswderrs $nswdacc ${total_time}s"
    } >"$sumfile"
    sumfiles+=("$sumfile")
  else
    echo "$0: skipping $testset: cannot read $imdir/$testset/pages" >&2
  fi
done

if [ "${#sumfiles[@]}" -eq 0 ]
then
  echo "$0: no test set produced a summary" >&2
  exit 1
fi
# Merge the per-set summaries, move them out of the reports directory and
# show the merged result.
cat "${sumfiles[@]}" >"$rdir/$vid".summary
mv "${sumfiles[@]}" langtests/results/
cat "$rdir/$vid".summary

60
langtests/runtestset.sh Executable file
View File

@ -0,0 +1,60 @@
#!/bin/bash
# File: runtestset.sh
# Description: Script to run tesseract on a single UNLV set.
# Author: Ray Smith
# Created: Wed Jun 13 10:13:01 PDT 2007
#
# (C) Copyright 2007, Google Inc.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Usage: runtestset.sh pagesfile tessdata-dir langcode
#
# Runs tesseract on every page listed in pagesfile, writing the recognized
# text to langtests/results/<set>/<page>.txt and one "<page> <time>" line per
# page to langtests/results/<set>.times.
if [ $# -ne 3 ]
then
  echo "Usage:$0 pagesfile tessdata-dir langcode "
  exit 1
fi

# The command is kept in an array so each word stays one argument. Because
# "time" comes from an expansion, bash runs the external time program rather
# than its keyword; -f %U -o times.txt is GNU time syntax for writing the
# user CPU seconds of the run to times.txt.
tess=(time -f %U -o times.txt ./src/api/tesseract)
tessdata=$2
langcode=$3
pages=$1
# The set directory is the pages path without its trailing "/pages";
# the set name is the last component of that directory.
imdir=${pages%/pages}
setname=${imdir##*/}
# Extra tesseract arguments; none by default.
config=()
resdir=langtests/results/$setname
echo -e "Testing on set $setname in directory $imdir to $resdir\n"
mkdir -p "$resdir"
rm -f "langtests/results/$setname.times"

# -r keeps backslashes literal; the extra test keeps a last line that lacks
# a trailing newline.
while read -r page dir || [ -n "$page" ]
do
  # Ignore blank lines instead of running tesseract on an empty page name.
  [ -n "$page" ] || continue
  # A pages file may be a list of files with subdirs or maybe just
  # a plain list of files so accommodate both.
  if [ "$dir" ]
  then
    srcdir="$imdir/$dir"
  else
    srcdir="$imdir"
  fi
  echo "$srcdir/$page.tif"
  # Remove the previous page's timing so that a run which writes no
  # times.txt is not recorded with a stale value.
  rm -f times.txt
  "${tess[@]}" "$srcdir/$page.tif" "$resdir/$page" --tessdata-dir "$tessdata" --oem 1 -l "$langcode" --psm 6 "${config[@]}" 2>&1 |grep -v "OCR Engine" |grep -v "Page 1"
  if [ -r times.txt ]
  then
    read -r t <times.txt
    echo "$page $t" >>"langtests/results/$setname.times"
    # ESC M is the terminal reverse-index sequence; presumably used so the
    # timing overwrites the path line printed above -- confirm.
    echo -e "\033M$page $t"
    # This text in times.txt means the run was interrupted (signal 2 is
    # SIGINT), so stop processing further pages.
    if [ "$t" = "Command terminated by signal 2" ]
    then
      exit 0
    fi
  fi
done <"$pages"