Add langtests framework with frk example

2025-01-18 06:30:14 +08:00 · 2018-08-30 14:28:34 +00:00 · 2018-08-30 14:28:34 +00:00 · 92922b421c
commit 92922b421c
parent a849860c6e
14 changed files with 367 additions and 1 deletions
--- a/Makefile.am
+++ b/Makefile.am
@ -24,7 +24,7 @@ SUBDIRS += src/ccmain src/api . tessdata doc unittest
 EXTRA_DIST = README.md\
 	aclocal.m4 config configure.ac autogen.sh contrib \
-	tesseract.pc.in $(TRAINING_SUBDIR) java doc unlvtests
+	tesseract.pc.in $(TRAINING_SUBDIR) java doc langtests unlvtests
 DIST_SUBDIRS = $(SUBDIRS) $(TRAINING_SUBDIR)
--- a/configure.ac
+++ b/configure.ac
@ -466,6 +466,7 @@ fi
 # Output files
 AC_CONFIG_FILES([Makefile tesseract.pc])
 AC_CONFIG_FILES([langtests/Makefile])
 AC_CONFIG_FILES([src/api/Makefile])
 AC_CONFIG_FILES([src/api/tess_version.h])
 AC_CONFIG_FILES([src/arch/Makefile])
--- a/langtests/.gitignore
+++ b/langtests/.gitignore
@ -0,0 +1,2 @@
 #
 results/*
--- a/langtests/Makefile.am
+++ b/langtests/Makefile.am
@ -0,0 +1,8 @@
 EXTRA_DIST = README.md
 EXTRA_DIST += frk_setup.sh
 EXTRA_DIST += frk_test.sh
 EXTRA_DIST += counttestset.sh
 EXTRA_DIST += runlangtests.sh
 EXTRA_DIST += runtestset.sh
 EXTRA_DIST += reports/*
--- a/langtests/README.md
+++ b/langtests/README.md
@ -0,0 +1,98 @@
 # How to run Language tests.
 The scripts in this directory make it possible to test Accuracy of Tesseract 
 for different languages. 
 ### Step 1: If not already installed, download the modified ISRI toolkit, 
 make and install the tools in /usr/local/bin.
 ```
 git clone https://github.com/Shreeshrii/ocr-evaluation-tools.git
 cd ocr-evaluation-tools
 sudo make install
 ```
 ### Step 2: If not already installed, build tesseract.
 ## Testing for Fraktur - frk and script/Fraktur
 ### Step 3: download the images and groundtruth
 ```
 mkdir -p ~/lang-downloads
 cd ~/lang-downloads
 wget -O frk-jbarth-ubhd.zip http://digi.ub.uni-heidelberg.de/diglitData/v/abbyy11r8-vs-tesseract4.zip
 wget -O frk-stweil-gt.zip https://digi.bib.uni-mannheim.de/~stweil/fraktur-gt.zip
 ```
 ### Step 4: extract the files. 
 It doesn't really matter where in your filesystem you put them, 
 but they must go under a common root, for example, ~/lang-files
 ```
 mkdir -p ~/lang-files
 cd ~/lang-files
 unzip ~/lang-downloads/frk-jbarth-ubhd.zip -d frk
 unzip ~/lang-downloads/frk-stweil-gt.zip -d frk
 mkdir -p ./frk-ligatures
 cp ./frk/abbyy-vs-tesseract/*.tif ./frk-ligatures/
 cp ./frk/gt/*.txt ./frk-ligatures/
 cd ./frk-ligatures/
 ls -1 *.tif >pages
 sed -i -e 's/.tif//g' pages
 cat pages
 ```
 ```
 mkdir -p ~/lang-stopwords
 cd ~/lang-stopwords
 wget -O frk.stopwords.txt https://raw.githubusercontent.com/stopwords-iso/stopwords-de/master/stopwords-de.txt
 ```
 Edit ~/lang-stopwords/frk.stopwords.txt, because
 wordacc uses a space-delimited stopwords file, not a line-delimited one.
 ```
 sed -i -e ':a;N;$!ba;s/\n/ /g' frk.stopwords.txt
 cat frk.stopwords.txt
 ```
 ### Step 5: run langtests/runlangtests.sh with the root ISRI data dir, testname, tessdata-dir, language code:
 ```
 cd ~/tesseract
 langtests/runlangtests.sh ~/lang-files 4_fast_Fraktur ../tessdata_fast/script Fraktur
 langtests/runlangtests.sh ~/lang-files 4_fast_frk ../tessdata_fast frk
 langtests/runlangtests.sh ~/lang-files 4_best_int_frk ../tessdata frk
 langtests/runlangtests.sh ~/lang-files 4_best_frk ../tessdata_best frk
 langtests/runlangtests.sh ~/lang-files 4_shreetest_frk-Fraktur /home/ubuntu/tessdata_frk/frk-finetune-impact frk
 langtests/runlangtests.sh ~/lang-files 4_shreetest_frk-frk /home/ubuntu/tessdata_frk/frk-finetune-frk frk
 ```
 and go to the gym, have lunch etc. It takes a while to run.
 ### Step 6: There should be a RELEASE.summary file for each test name,
 for example *langtests/reports/4_fast_frk.summary*, that contains the final summarized accuracy.
 #### Notes from Nick White regarding wordacc
 If you just want to remove all lines which have 100% recognition,
 you can add an 'awk' command like this:
 ocrevalutf8 wordacc ground.txt ocr.txt | awk '$3 != 100 {print $0}' > results.txt
 or if you've already got a results file you want to change, you can do this:
 awk '$3 != 100 {print $0}' results.txt > newresults.txt
 If you only want the last sections where things are broken down by
 word, you can add a sed command, like this:
 ocrevalutf8 wordacc ground.txt ocr.txt | sed '/^   Count   Missed %Right   $/,$ !d' | awk '$3 != 100 {print $0}' > results.txt
--- a/langtests/counttestset.sh
+++ b/langtests/counttestset.sh
@ -0,0 +1,52 @@
 #!/bin/bash
 # File:        counttestset.sh
 # Description: Script to count the errors on a single UNLV set.
 # Author:      Ray Smith
 # Created:     Wed Jun 13 11:58:01 PDT 2007
 #
 # (C) Copyright 2007, Google Inc.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 # http://www.apache.org/licenses/LICENSE-2.0
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # Usage: counttestset.sh pagesfile langcode
 #   pagesfile  path ending in ".../<setname>/pages"; each line is
 #              "<page> [<subdir>]" (page name without the .tif extension).
 #   langcode   selects the stopword list "$resdir/<langcode>.stopwords".
 # For every page, scores the OCR output in langtests/results/<setname>
 # against the ground truth next to the image, then aggregates the per-page
 # reports into langtests/results/<setname>.characc and .wordacc.
 if [ $# -ne 2 ]
 then
  echo "Usage:$0 pagesfile langcode"
  exit 1
 fi
 pages=$1
 langcode=$2
 # Strip the trailing "/pages" to get the set directory, then take its last
 # path component as the set name.
 imdir=${pages%/pages}
 setname=${imdir##*/}
 resdir=langtests/results/$setname
 mkdir -p langtests/reports
 echo "Counting on set $setname in directory $imdir to $resdir"
 # Arrays keep each report path as one word even if it contains spaces.
 accfiles=()
 wafiles=()
 # "|| [ -n ... ]" still processes a last line that lacks a trailing newline.
 while read -r page dir || [ -n "$page" ]
 do
  # Ignore blank lines instead of scoring a non-existent "/.txt".
  [ -n "$page" ] || continue
  # A pages file may list "<page> <subdir>" or just "<page>".
  if [ "$dir" ]
  then
     srcdir="$imdir/$dir"
  else
     srcdir="$imdir"
  fi
  echo "$srcdir/$page.tif"
  # Count character errors.
  ocrevalutf8 accuracy "$srcdir/$page.txt" "$resdir/$page.txt" > "$resdir/$page.acc"
  accfiles+=("$resdir/$page.acc")
  # Count word errors.
  ocrevalutf8 wordacc -S"$resdir/$langcode.stopwords" "$srcdir/$page.txt" "$resdir/$page.txt" > "$resdir/$page.wa"
  wafiles+=("$resdir/$page.wa")
 done <"$pages"
 accsum "${accfiles[@]}" >"langtests/results/$setname.characc"
 wordaccsum "${wafiles[@]}" >"langtests/results/$setname.wordacc"
--- a/langtests/frk_setup.sh
+++ b/langtests/frk_setup.sh
@ -0,0 +1,24 @@
 #!/bin/bash
 #
 # Fetch and unpack the Fraktur test data:
 #   ~/lang-downloads          the two downloaded zip archives
 #   ~/lang-files/frk          the unpacked archives
 #   ~/lang-files/frk-ligatures  images, ground truth and the "pages" list
 #   ~/lang-stopwords          the German stopword list
 # Every cd is checked so that a failure cannot make the following downloads
 # and copies land in whatever directory the script was started from.
 mkdir -p ~/lang-downloads
 cd ~/lang-downloads || exit 1
 wget -O frk-jbarth-ubhd.zip http://digi.ub.uni-heidelberg.de/diglitData/v/abbyy11r8-vs-tesseract4.zip
 wget -O frk-stweil-gt.zip https://digi.bib.uni-mannheim.de/~stweil/fraktur-gt.zip
 mkdir -p ~/lang-files
 cd ~/lang-files || exit 1
 unzip ~/lang-downloads/frk-jbarth-ubhd.zip -d frk
 unzip ~/lang-downloads/frk-stweil-gt.zip -d frk
 mkdir -p ./frk-ligatures
 cp ./frk/abbyy-vs-tesseract/*.tif ./frk-ligatures/
 cp ./frk/gt/*.txt ./frk-ligatures/
 cd ./frk-ligatures/ || exit 1
 # Write one page name per line, removing only the trailing ".tif".
 # (The previous sed 's/.tif//g' also ate any "<char>tif" inside a name.)
 for tif in *.tif
 do
  # With no matching files the glob stays literal; skip it.
  [ -e "$tif" ] || continue
  printf '%s\n' "${tif%.tif}"
 done >pages
 mkdir -p ~/lang-stopwords
 cd ~/lang-stopwords || exit 1
 wget -O frk.stopwords.txt https://raw.githubusercontent.com/stopwords-iso/stopwords-de/master/stopwords-de.txt
 echo "Edit ~/lang-stopwords/frk.stopwords.txt as wordacc uses a space delimited stopwords file, not line delimited."
--- a/langtests/frk_test.sh
+++ b/langtests/frk_test.sh
@ -0,0 +1,13 @@
 #!/bin/bash
 #
 # Run langtests/runlangtests.sh once per Fraktur model.  Arguments are the
 # root test-data dir, the test name, the tessdata dir and the language code.
 # The relative langtests/ and ../tessdata* paths only make sense from the
 # tesseract source tree, so stop if it cannot be entered.
 cd ~/tesseract || exit 1
 langtests/runlangtests.sh ~/lang-files 4_fast_Fraktur ../tessdata_fast/script Fraktur
 langtests/runlangtests.sh ~/lang-files 4_fast_frk ../tessdata_fast frk
 langtests/runlangtests.sh ~/lang-files 4_best_int_frk ../tessdata frk
 langtests/runlangtests.sh ~/lang-files 4_best_frk ../tessdata_best frk
 ### It takes a while to run.
--- a/langtests/reports/4_best_frk.summary
+++ b/langtests/reports/4_best_frk.summary
@ -0,0 +1,2 @@
 RELEASE		TestSet	CharErrors	Accuracy	WordErrors	Accuracy	NonStopWErrors	Accuracy	TimeTaken
 4_best_frk	frk-ligatures	178		94.73%		100		81.31%		74			75.17		94.29s
--- a/langtests/reports/4_best_int_frk.summary
+++ b/langtests/reports/4_best_int_frk.summary
@ -0,0 +1,2 @@
 RELEASE		TestSet	CharErrors	Accuracy	WordErrors	Accuracy	NonStopWErrors	Accuracy	TimeTaken
 4_best_int_frk	frk-ligatures	244		92.78%		109		79.63%		80			73.15		89.80s
--- a/langtests/reports/4_fast_Fraktur.summary
+++ b/langtests/reports/4_fast_Fraktur.summary
@ -0,0 +1,2 @@
 RELEASE		TestSet	CharErrors	Accuracy	WordErrors	Accuracy	NonStopWErrors	Accuracy	TimeTaken
 4_fast_Fraktur	frk-ligatures	265		92.16%		116		78.32%		82			72.48		91.29s
--- a/langtests/reports/4_fast_frk.summary
+++ b/langtests/reports/4_fast_frk.summary
@ -0,0 +1,2 @@
 RELEASE		TestSet	CharErrors	Accuracy	WordErrors	Accuracy	NonStopWErrors	Accuracy	TimeTaken
 4_fast_frk	frk-ligatures	244		92.78%		109		79.63%		80			73.15		89.98s
--- a/langtests/runlangtests.sh
+++ b/langtests/runlangtests.sh
@ -0,0 +1,100 @@
 #!/bin/bash
 ##############################################################################
 # File:        runlangtests.sh
 # Description: Script to run the test sets defined for a language (e.g. Fraktur).
 #                      based on runalltests.sh by Ray Smith
 # Author:      Shree Devi Kumar
 # Created:     June 09, 2018
 #
 # (C) Copyright 2007, Google Inc.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 # http://www.apache.org/licenses/LICENSE-2.0
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 ##############################################################################
 # Exactly four positional arguments are required; otherwise print the usage
 # line and stop.  The third and fourth are captured here.
 [ "$#" -eq 4 ] || {
   echo "Usage:$0 unlv-data-dir version-id tessdata-dir langcode"
   exit 1
 }
 lang=$4
 tessdata=$3
 # timesum FILE
 # Adds up the second whitespace-separated field of every line in FILE (the
 # per-page cpu time) and prints the total with two decimal places.
 timesum() {
   awk '
     { sum += $2 }
     END { printf("%.2f\n", sum + 0.0) }
   ' "$1"
 }
 # Main driver.  For each test set registered for the requested language:
 # run tesseract over its pages, score the output, and write a two-line
 # summary to langtests/reports/<version-id>.<set>.sum.  The per-set files
 # are finally merged into langtests/reports/<version-id>.summary.
 # $1 is the root directory with one sub-directory per test set; $2 is the
 # version/test name used to label the reports.
 imdir="$1"
 vid="$2"
 # Directory part of the path this script was invoked with; runtestset.sh
 # and counttestset.sh are run from there.
 bindir=${0%/*}
 if [ "$bindir" = "$0" ]
 then
    bindir="./"
 fi
 rdir=langtests/reports
 if [ "$lang" = "frk" ] || [ "$lang" = "Fraktur" ]
 then
    testsets="frk-ligatures"
 fi
 # Fail early with a clear message; otherwise the loop below is a no-op and
 # the final cat dies on an unmatched *.sum glob.
 if [ -z "$testsets" ]
 then
    echo "$0: no test sets are defined for language '$lang'" >&2
    exit 1
 fi
 # The .sum files are written here, so make sure the directory exists.
 mkdir -p "$rdir"
 for set in $testsets
 do
    resdir=langtests/results/$set
    mkdir -p "$resdir"
    # counttestset.sh passes "$resdir/<langcode>.stopwords" to wordacc.
    cp ~/lang-stopwords/frk.stopwords.txt "$resdir/$lang.stopwords"
    if [ -r "$imdir/$set/pages" ]
    then
       # Run tesseract on all the pages.
       "$bindir/runtestset.sh" "$imdir/$set/pages" "$tessdata" "$lang"
       # Count the errors on all the pages.
       "$bindir/counttestset.sh" "$imdir/$set/pages" "$lang"
       # Get the new character, word and nonstop word errors and accuracy.
       # The values are cut from fixed columns of the summed reports
       # (lines 4 and 5, and the second "Total" line of the word report).
       cherrs=$(head -4 "langtests/results/$set.characc" | tail -1 |
          cut -c1-9 | tr -d '[:blank:]')
       chacc=$(head -5 "langtests/results/$set.characc" | tail -1 |
          cut -c1-9 | tr -d '[:blank:]')
       wderrs=$(head -4 "langtests/results/$set.wordacc" | tail -1 |
          cut -c1-9 | tr -d '[:blank:]')
       wdacc=$(head -5 "langtests/results/$set.wordacc" | tail -1 |
          cut -c1-9 | tr -d '[:blank:]')
       nswderrs=$(grep Total "langtests/results/$set.wordacc" | head -2 |
          tail -1 | cut -c10-17 | tr -d '[:blank:]')
       nswdacc=$(grep Total "langtests/results/$set.wordacc" | head -2 |
          tail -1 | cut -c19-26 | tr -d '[:blank:]')
       sumfile=$rdir/$vid.$set.sum
       if [ -r "langtests/results/$set.times" ]
       then
          total_time=$(timesum "langtests/results/$set.times")
       else
          total_time='0.0'
       fi
       # Explicit \t escapes keep the column layout independent of how this
       # script's own whitespace is edited.
       printf 'RELEASE\t\tTestSet\tCharErrors\tAccuracy\tWordErrors\tAccuracy\tNonStopWErrors\tAccuracy\tTimeTaken\n' >"$sumfile"
       printf '%s\t%s\t%s\t\t%s\t\t%s\t\t%s\t\t%s\t\t\t%s\t\t%ss\n' \
          "$vid" "$set" "$cherrs" "$chacc" "$wderrs" "$wdacc" \
          "$nswderrs" "$nswdacc" "$total_time" >>"$sumfile"
    else
       echo "$0: cannot read $imdir/$set/pages, skipping $set" >&2
    fi
 done
 cat "$rdir/$vid".*.sum >"$rdir/$vid".summary
 mv "$rdir/$vid".*.sum langtests/results/
 cat "$rdir/$vid".summary
--- a/langtests/runtestset.sh
+++ b/langtests/runtestset.sh
@ -0,0 +1,60 @@
 #!/bin/bash
 # File:        runtestset.sh
 # Description: Script to run tesseract on a single UNLV set.
 # Author:      Ray Smith
 # Created:     Wed Jun 13 10:13:01 PDT 2007
 #
 # (C) Copyright 2007, Google Inc.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 # http://www.apache.org/licenses/LICENSE-2.0
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # Usage: runtestset.sh pagesfile tessdata-dir langcode
 #   pagesfile     path ending in ".../<setname>/pages"; each line is
 #                 "<page> [<subdir>]" (page name without the .tif extension).
 #   tessdata-dir  passed to tesseract as --tessdata-dir.
 #   langcode      passed to tesseract as -l.
 # Runs ./src/api/tesseract on every page, writing the output to
 # langtests/results/<setname>/<page> and the per-page time reported by
 # time(1) to langtests/results/<setname>.times.
 if [ $# -ne 3 ]
 then
  echo "Usage:$0 pagesfile tessdata-dir langcode "
  exit 1
 fi
 pages=$1
 tessdata=$2
 langcode=$3
 # Command prefix as an array so every word survives quoting intact.
 # NOTE(review): -f/-o are GNU time options; this assumes an external time
 # program supporting them is on PATH - confirm on non-GNU systems.
 tess=(time -f %U -o times.txt ./src/api/tesseract)
 imdir=${pages%/pages}
 setname=${imdir##*/}
 # Extra tesseract arguments appended to every call; none by default.
 config=()
 resdir=langtests/results/$setname
 timesfile=langtests/results/$setname.times
 printf 'Testing on set %s in directory %s to %s\n\n' "$setname" "$imdir" "$resdir"
 mkdir -p "$resdir"
 rm -f "$timesfile"
 # "|| [ -n ... ]" still processes a last line that lacks a trailing newline.
 while read -r page dir || [ -n "$page" ]
 do
  # Ignore blank lines instead of running tesseract on "/.tif".
  [ -n "$page" ] || continue
  # A pages file may be a list of files with subdirs or maybe just
  # a plain list of files so accommodate both.
  if [ "$dir" ]
  then
     srcdir="$imdir/$dir"
  else
     srcdir="$imdir"
  fi
  echo "$srcdir/$page.tif"
  # Drop any earlier timing so a failed run cannot re-log a stale value.
  rm -f times.txt
  "${tess[@]}" "$srcdir/$page.tif" "$resdir/$page" --tessdata-dir "$tessdata" --oem 1 -l "$langcode" --psm 6 "${config[@]}" 2>&1 |grep -v "OCR Engine" |grep -v "Page 1"
  if [ -r times.txt ]
  then
    read -r t <times.txt
    echo "$page $t" >>"$timesfile"
    # ESC M moves the cursor up one line so this overwrites the path echoed
    # above.
    printf '\033M%s %s\n' "$page" "$t"
    # Stop the whole run when the page was interrupted (signal 2).
    if [ "$t" = "Command terminated by signal 2" ]
    then
      exit 0
    fi
  fi
 done <"$pages"
		`@ -0,0 +1,2 @@`
							`RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken`
							`4_best_frk frk-ligatures 178 94.73% 100 81.31% 74 75.17 94.29s`
		`@ -0,0 +1,2 @@`
							`RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken`
							`4_best_int_frk frk-ligatures 244 92.78% 109 79.63% 80 73.15 89.80s`
		`@ -0,0 +1,2 @@`
							`RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken`
							`4_fast_Fraktur frk-ligatures 265 92.16% 116 78.32% 82 72.48 91.29s`
		`@ -0,0 +1,2 @@`
							`RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken`
							`4_fast_frk frk-ligatures 244 92.78% 109 79.63% 80 73.15 89.98s`