diff --git a/Makefile.am b/Makefile.am index 947ccb5b7..de6af2ebd 100644 --- a/Makefile.am +++ b/Makefile.am @@ -24,7 +24,7 @@ SUBDIRS += src/ccmain src/api . tessdata doc unittest EXTRA_DIST = README.md\ aclocal.m4 config configure.ac autogen.sh contrib \ - tesseract.pc.in $(TRAINING_SUBDIR) java doc unlvtests + tesseract.pc.in $(TRAINING_SUBDIR) java doc langtests unlvtests DIST_SUBDIRS = $(SUBDIRS) $(TRAINING_SUBDIR) diff --git a/configure.ac b/configure.ac index e74e5f178..d27f5ce38 100644 --- a/configure.ac +++ b/configure.ac @@ -466,6 +466,7 @@ fi # Output files AC_CONFIG_FILES([Makefile tesseract.pc]) +AC_CONFIG_FILES([langtests/Makefile]) AC_CONFIG_FILES([src/api/Makefile]) AC_CONFIG_FILES([src/api/tess_version.h]) AC_CONFIG_FILES([src/arch/Makefile]) diff --git a/langtests/.gitignore b/langtests/.gitignore new file mode 100644 index 000000000..d9f9b7fa1 --- /dev/null +++ b/langtests/.gitignore @@ -0,0 +1,2 @@ +# +results/* diff --git a/langtests/Makefile.am b/langtests/Makefile.am new file mode 100644 index 000000000..2103eeef8 --- /dev/null +++ b/langtests/Makefile.am @@ -0,0 +1,8 @@ + +EXTRA_DIST = README.md +EXTRA_DIST += frk_setup.sh +EXTRA_DIST += frk_test.sh +EXTRA_DIST += counttestset.sh +EXTRA_DIST += runlangtests.sh +EXTRA_DIST += runtestset.sh +EXTRA_DIST += reports/* diff --git a/langtests/README.md b/langtests/README.md new file mode 100644 index 000000000..af4b6095c --- /dev/null +++ b/langtests/README.md @@ -0,0 +1,98 @@ +# How to run Language tests. + +The scripts in this directory make it possible to test the accuracy of Tesseract +for different languages. + +### Step 1: If not already installed, download the modified ISRI toolkit, +make and install the tools in /usr/local/bin. + +``` +git clone https://github.com/Shreeshrii/ocr-evaluation-tools.git +cd ocr-evaluation-tools +sudo make install +``` + +### Step 2: If not already installed, build tesseract. 
+ +## Testing for Fraktur - frk and script/Fraktur + +### Step 3: download the images and ground truth + +``` +mkdir -p ~/lang-downloads +cd ~/lang-downloads +wget -O frk-jbarth-ubhd.zip http://digi.ub.uni-heidelberg.de/diglitData/v/abbyy11r8-vs-tesseract4.zip +wget -O frk-stweil-gt.zip https://digi.bib.uni-mannheim.de/~stweil/fraktur-gt.zip +``` + +### Step 4: extract the files. +It doesn't really matter where in your filesystem you put them, +but they must go under a common root, for example, ~/lang-files + +``` +mkdir -p ~/lang-files +cd ~/lang-files +unzip ~/lang-downloads/frk-jbarth-ubhd.zip -d frk +unzip ~/lang-downloads/frk-stweil-gt.zip -d frk +mkdir -p ./frk-ligatures +cp ./frk/abbyy-vs-tesseract/*.tif ./frk-ligatures/ +cp ./frk/gt/*.txt ./frk-ligatures/ + +cd ./frk-ligatures/ +ls -1 *.tif >pages +sed -i -e 's/\.tif$//' pages +cat pages +``` + +``` +mkdir -p ~/lang-stopwords +cd ~/lang-stopwords +wget -O frk.stopwords.txt https://raw.githubusercontent.com/stopwords-iso/stopwords-de/master/stopwords-de.txt +``` +Edit ~/lang-stopwords/frk.stopwords.txt, as +wordacc uses a space-delimited stopwords file, not a line-delimited one. + +``` +sed -i -e ':a' -e 'N' -e '$!ba' -e 's/\n/ /g' frk.stopwords.txt +cat frk.stopwords.txt +``` + +### Step 5: run langtests/runlangtests.sh with the root ISRI data dir, testname, tessdata-dir, language code: + +``` +cd ~/tesseract +langtests/runlangtests.sh ~/lang-files 4_fast_Fraktur ../tessdata_fast/script Fraktur +langtests/runlangtests.sh ~/lang-files 4_fast_frk ../tessdata_fast frk +langtests/runlangtests.sh ~/lang-files 4_best_int_frk ../tessdata frk +langtests/runlangtests.sh ~/lang-files 4_best_frk ../tessdata_best frk + + + + +langtests/runlangtests.sh ~/lang-files 4_shreetest_frk-Fraktur /home/ubuntu/tessdata_frk/frk-finetune-impact frk +langtests/runlangtests.sh ~/lang-files 4_shreetest_frk-frk /home/ubuntu/tessdata_frk/frk-finetune-frk frk +``` +and go to the gym, have lunch etc. It takes a while to run. 
+ +### Step 6: There should be a RELEASE.summary file, for example +*langtests/reports/4_fast_frk.summary*, that contains the final summarized accuracy. + +``` + +#### Notes from Nick White regarding wordacc + +If you just want to remove all lines which have 100% recognition, +you can add an 'awk' command like this: + +ocrevalutf8 wordacc ground.txt ocr.txt | awk '$3 != 100 {print $0}' \ +> results.txt + +or if you've already got a results file you want to change, you can do this: + +awk '$3 != 100 {print $0}' results.txt > newresults.txt + +If you only want the last sections where things are broken down by +word, you can add a sed command, like this: + +ocrevalutf8 wordacc ground.txt ocr.txt | sed '/^ Count Missed %Right $/,$ +!d' | awk '$3 != 100 {print $0}' > results.txt diff --git a/langtests/counttestset.sh b/langtests/counttestset.sh new file mode 100755 index 000000000..d9ef4ce01 --- /dev/null +++ b/langtests/counttestset.sh @@ -0,0 +1,52 @@ +#!/bin/bash +# File: counttestset.sh +# Description: Script to count the errors on a single UNLV set. +# Author: Ray Smith +# Created: Wed Jun 13 11:58:01 PDT 2007 +# +# (C) Copyright 2007, Google Inc. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+if [ $# -ne 2 ]
+then
+  echo "Usage:$0 pagesfile langcode" >&2
+  exit 1
+fi
+# $1: path of the 'pages' list inside a test-set directory; $2: language code.
+pages=$1
+langcode=$2
+# Strip the trailing /pages to get the image dir; its last component names the set.
+imdir=${pages%/pages}
+setname=${imdir##*/}
+resdir=langtests/results/$setname
+mkdir -p langtests/reports
+echo "Counting on set $setname in directory $imdir to $resdir"
+accfiles=""
+wafiles=""
+while read -r page dir
+do
+  if [ "$dir" ]
+  then
+    srcdir="$imdir/$dir"
+  else
+    srcdir="$imdir"
+  fi
+  echo "$srcdir/$page.tif"
+  # Count character errors.
+  ocrevalutf8 accuracy "$srcdir/$page.txt" "$resdir/$page.txt" > "$resdir/$page.acc"
+  accfiles="$accfiles $resdir/$page.acc"
+  # Count word errors.
+  ocrevalutf8 wordacc -S"$resdir/$langcode.stopwords" "$srcdir/$page.txt" "$resdir/$page.txt" > "$resdir/$page.wa"
+  wafiles="$wafiles $resdir/$page.wa"
+done <"$pages"
+# The file lists are deliberately unquoted so each path becomes its own argument.
+accsum $accfiles >"langtests/results/$setname.characc"
+wordaccsum $wafiles >"langtests/results/$setname.wordacc"
diff --git a/langtests/frk_setup.sh b/langtests/frk_setup.sh
new file mode 100644
index 000000000..e86b6109f
--- /dev/null
+++ b/langtests/frk_setup.sh
@@ -0,0 +1,24 @@
+#!/bin/bash
+# Download and unpack the Fraktur test images, ground truth and stopword list.
+mkdir -p ~/lang-downloads
+cd ~/lang-downloads || exit 1
+wget -O frk-jbarth-ubhd.zip http://digi.ub.uni-heidelberg.de/diglitData/v/abbyy11r8-vs-tesseract4.zip
+wget -O frk-stweil-gt.zip https://digi.bib.uni-mannheim.de/~stweil/fraktur-gt.zip
+
+mkdir -p ~/lang-files
+cd ~/lang-files || exit 1
+unzip ~/lang-downloads/frk-jbarth-ubhd.zip -d frk
+unzip ~/lang-downloads/frk-stweil-gt.zip -d frk
+mkdir -p ./frk-ligatures
+cp ./frk/abbyy-vs-tesseract/*.tif ./frk-ligatures/
+cp ./frk/gt/*.txt ./frk-ligatures/
+# Build the 'pages' list: one image base name (without the .tif suffix) per line.
+cd ./frk-ligatures/ || exit 1
+ls -1 *.tif >pages
+sed -i -e 's/\.tif$//' pages
+
+mkdir -p ~/lang-stopwords
+cd ~/lang-stopwords || exit 1
+wget -O frk.stopwords.txt https://raw.githubusercontent.com/stopwords-iso/stopwords-de/master/stopwords-de.txt
+
+echo "Edit ~/lang-stopwords/frk.stopwords.txt as wordacc uses a space delimited stopwords file, not line delimited."
diff --git a/langtests/frk_test.sh b/langtests/frk_test.sh
new file mode 100644
index 000000000..83078ca96
--- /dev/null
+++ b/langtests/frk_test.sh
@@ -0,0 +1,13 @@
+#!/bin/bash
+#
+# run langtests/runlangtests.sh with the root ISRI data dir, testname, tessdata-dir, language code:
+# The script and tessdata paths below are relative, so stop if the cd fails.
+cd ~/tesseract || exit 1
+langtests/runlangtests.sh ~/lang-files 4_fast_Fraktur ../tessdata_fast/script Fraktur
+
+langtests/runlangtests.sh ~/lang-files 4_fast_frk ../tessdata_fast frk
+langtests/runlangtests.sh ~/lang-files 4_best_int_frk ../tessdata frk
+langtests/runlangtests.sh ~/lang-files 4_best_frk ../tessdata_best frk
+
+### It takes a while to run.
+
diff --git a/langtests/reports/4_best_frk.summary b/langtests/reports/4_best_frk.summary
new file mode 100644
index 000000000..0b963f682
--- /dev/null
+++ b/langtests/reports/4_best_frk.summary
@@ -0,0 +1,2 @@
+RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken
+4_best_frk frk-ligatures 178 94.73% 100 81.31% 74 75.17 94.29s
diff --git a/langtests/reports/4_best_int_frk.summary b/langtests/reports/4_best_int_frk.summary
new file mode 100644
index 000000000..20df4cd8e
--- /dev/null
+++ b/langtests/reports/4_best_int_frk.summary
@@ -0,0 +1,2 @@
+RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken
+4_best_int_frk frk-ligatures 244 92.78% 109 79.63% 80 73.15 89.80s
diff --git a/langtests/reports/4_fast_Fraktur.summary b/langtests/reports/4_fast_Fraktur.summary
new file mode 100644
index 000000000..b8f8e81b7
--- /dev/null
+++ b/langtests/reports/4_fast_Fraktur.summary
@@ -0,0 +1,2 @@
+RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken
+4_fast_Fraktur frk-ligatures 265 92.16% 116 78.32% 82 72.48 91.29s
diff --git a/langtests/reports/4_fast_frk.summary b/langtests/reports/4_fast_frk.summary
new file mode 100644
index 000000000..42ce1bcd3
--- /dev/null
+++ b/langtests/reports/4_fast_frk.summary
@@ -0,0 +1,2 @@
+RELEASE TestSet
CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken
+4_fast_frk frk-ligatures 244 92.78% 109 79.63% 80 73.15 89.98s
diff --git a/langtests/runlangtests.sh b/langtests/runlangtests.sh
new file mode 100755
index 000000000..0af075cb3
--- /dev/null
+++ b/langtests/runlangtests.sh
@@ -0,0 +1,114 @@
+#!/bin/bash
+##############################################################################
+# File: runlangtests.sh
+# Description: Script to run a set of language test sets (currently Fraktur only).
+# based on runalltests.sh by Ray Smith
+# Author: Shree Devi Kumar
+# Created: June 09, 2018
+#
+# (C) Copyright 2007, Google Inc.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+##############################################################################
+if [ $# -ne 4 ]
+then
+  echo "Usage:$0 unlv-data-dir version-id tessdata-dir langcode" >&2
+  exit 1
+fi
+
+# Positional arguments:
+#   $1 imdir    - root directory holding one sub-directory per test set
+#   $2 vid      - version id; used to name the report files
+#   $3 tessdata - tessdata directory handed on to runtestset.sh
+#   $4 lang     - language/script code (frk or Fraktur)
+imdir="$1"
+vid="$2"
+tessdata=$3
+lang=$4
+
+#timesum computes the total cpu time
+# It adds up the second whitespace-separated field of every line of file $1
+# and prints the total with two decimals.
+timesum() {
+awk ' BEGIN {
+total = 0.0;
+}
+{
+  total += $2;
+}
+END {
+  printf("%.2f\n", total);
+}' "$1"
+}
+
+# Directory part of the path this script was invoked with; the helper scripts live there.
+bindir=${0%/*}
+if [ "$bindir" = "$0" ]
+then
+  bindir="./"
+fi
+rdir=langtests/reports
+mkdir -p "$rdir"
+if [ "$lang" = "frk" ] || [ "$lang" = "Fraktur" ]
+then
+  testsets="frk-ligatures"
+else
+  # Without a test set nothing would run and no .sum file would exist to collect.
+  echo "Unsupported langcode '$lang': only frk and Fraktur have test sets" >&2
+  exit 1
+fi
+
+for set in $testsets
+do
+  resdir=langtests/results/$set
+  mkdir -p "$resdir"
+  # Both supported codes are Fraktur, so the frk stopword list is used for either.
+  cp ~/lang-stopwords/frk.stopwords.txt "$resdir/$lang.stopwords"
+  if [ -r "$imdir/$set/pages" ]
+  then
+    # Run tesseract on all the pages.
+    "$bindir/runtestset.sh" "$imdir/$set/pages" "$tessdata" "$lang"
+    # Count the errors on all the pages.
+    "$bindir/counttestset.sh" "$imdir/$set/pages" "$lang"
+    # Get the new character word and nonstop word errors and accuracy.
+    # The values are cut out of fixed line and column positions of the reports.
+    cherrs=$(head -4 "langtests/results/$set.characc" |tail -1 |cut -c1-9 |
+      tr -d '[:blank:]')
+    chacc=$(head -5 "langtests/results/$set.characc" |tail -1 |cut -c1-9 |
+      tr -d '[:blank:]')
+    wderrs=$(head -4 "langtests/results/$set.wordacc" |tail -1 |cut -c1-9 |
+      tr -d '[:blank:]')
+    wdacc=$(head -5 "langtests/results/$set.wordacc" |tail -1 |cut -c1-9 |
+      tr -d '[:blank:]')
+    nswderrs=$(grep Total "langtests/results/$set.wordacc" |head -2 |tail -1 |
+      cut -c10-17 |tr -d '[:blank:]')
+    nswdacc=$(grep Total "langtests/results/$set.wordacc" |head -2 |tail -1 |
+      cut -c19-26 |tr -d '[:blank:]')
+
+    sumfile=$rdir/$vid.$set.sum
+    if [ -r "langtests/results/$set.times" ]
+    then
+      total_time=$(timesum "langtests/results/$set.times")
+    else
+      total_time='0.0'
+    fi
+    echo "RELEASE TestSet CharErrors Accuracy WordErrors Accuracy\
+ NonStopWErrors Accuracy TimeTaken">"$sumfile"
+    echo "$vid $set $cherrs $chacc $wderrs $wdacc\
+ $nswderrs $nswdacc ${total_time}s" >>"$sumfile"
+  else
+    echo "Skipping $set: cannot read $imdir/$set/pages" >&2
+  fi
+done
+
+cat "$rdir/$vid".*.sum >"$rdir/$vid".summary
+
+mv "$rdir/$vid".*.sum langtests/results/
+cat "$rdir/$vid".summary
diff --git a/langtests/runtestset.sh b/langtests/runtestset.sh
new file mode 100755
index 000000000..fc12f40eb
--- /dev/null
+++ b/langtests/runtestset.sh
@@ -0,0 +1,60 @@
+#!/bin/bash
+# File: runtestset.sh
+# Description: Script to run tesseract on a single UNLV set.
+# Author: Ray Smith
+# Created: Wed Jun 13 10:13:01 PDT 2007
+#
+# (C) Copyright 2007, Google Inc.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if [ $# -ne 3 ]
+then
+  echo "Usage:$0 pagesfile tessdata-dir langcode " >&2
+  exit 1
+fi
+# Each run is wrapped in time, which writes the user CPU time (%U) to times.txt.
+tess="time -f %U -o times.txt ./src/api/tesseract"
+
+tessdata=$2
+langcode=$3
+pages=$1
+imdir=${pages%/pages}
+setname=${imdir##*/}
+# Extra tesseract arguments; empty here and deliberately expanded unquoted below.
+config=""
+resdir=langtests/results/$setname
+
+echo -e "Testing on set $setname in directory $imdir to $resdir\n"
+mkdir -p "$resdir"
+rm -f "langtests/results/$setname.times"
+while read -r page dir
+do
+  # A pages file may be a list of files with subdirs or maybe just
+  # a plain list of files so accommodate both.
+  if [ "$dir" ]
+  then
+    srcdir="$imdir/$dir"
+  else
+    srcdir="$imdir"
+  fi
+  echo "$srcdir/$page.tif"
+  $tess "$srcdir/$page.tif" "$resdir/$page" --tessdata-dir "$tessdata" --oem 1 -l "$langcode" --psm 6 $config 2>&1 |grep -v "OCR Engine" |grep -v "Page 1"
+  if [ -r times.txt ]
+  then
+    read -r t <times.txt  # take the timing from times.txt, not from the pages list on stdin
+    echo "$page $t" >>"langtests/results/$setname.times"
+    echo -e "\033M$page $t"
+    if [ "$t" = "Command terminated by signal 2" ]
+    then
+      exit 0
+    fi
+  fi
+done <"$pages"