Add langtests framework with frk example
This commit is contained in:
parent a849860c6e
commit 92922b421c
Makefile.am
@@ -24,7 +24,7 @@ SUBDIRS += src/ccmain src/api . tessdata doc unittest
 EXTRA_DIST = README.md \
     aclocal.m4 config configure.ac autogen.sh contrib \
-    tesseract.pc.in $(TRAINING_SUBDIR) java doc unlvtests
+    tesseract.pc.in $(TRAINING_SUBDIR) java doc langtests unlvtests

 DIST_SUBDIRS = $(SUBDIRS) $(TRAINING_SUBDIR)
configure.ac
@@ -466,6 +466,7 @@ fi

 # Output files
 AC_CONFIG_FILES([Makefile tesseract.pc])
+AC_CONFIG_FILES([langtests/Makefile])
 AC_CONFIG_FILES([src/api/Makefile])
 AC_CONFIG_FILES([src/api/tess_version.h])
 AC_CONFIG_FILES([src/arch/Makefile])
2 langtests/.gitignore vendored Normal file
@@ -0,0 +1,2 @@
#
results/*
langtests/Makefile.am
Normal file
8
langtests/Makefile.am
Normal file
@ -0,0 +1,8 @@
|
||||
|
||||
EXTRA_DIST = README.md
|
||||
EXTRA_DIST += frk_setup.sh
|
||||
EXTRA_DIST += frk_test.sh
|
||||
EXTRA_DIST += counttestset.sh
|
||||
EXTRA_DIST += runlangtests.sh
|
||||
EXTRA_DIST += runtestset.sh
|
||||
EXTRA_DIST += reports/*
|
98 langtests/README.md Normal file
@@ -0,0 +1,98 @@
# How to run Language tests

The scripts in this directory make it possible to test the accuracy of Tesseract
for different languages.

### Step 1: If not already installed, download the modified ISRI toolkit, then make and install the tools in /usr/local/bin:
```
git clone https://github.com/Shreeshrii/ocr-evaluation-tools.git
cd ~/ocr-evaluation-tools
sudo make install
```
### Step 2: If not already installed, build Tesseract.
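A minimal from-source build sketch, assuming the standard autotools workflow used by this repository (the langtests scripts invoke `./src/api/tesseract` from the build tree, so `make install` is optional for testing):

```
git clone https://github.com/tesseract-ocr/tesseract.git
cd tesseract
./autogen.sh
./configure          # assumes build dependencies (e.g. leptonica) are already installed
make
sudo make install    # optional; the test scripts run the binary from ./src/api
```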
## Testing for Fraktur - frk and script/Fraktur
### Step 3: Download the images and ground truth.

```
mkdir -p ~/lang-downloads
cd ~/lang-downloads
wget -O frk-jbarth-ubhd.zip http://digi.ub.uni-heidelberg.de/diglitData/v/abbyy11r8-vs-tesseract4.zip
wget -O frk-stweil-gt.zip https://digi.bib.uni-mannheim.de/~stweil/fraktur-gt.zip
```
### Step 4: Extract the files.
It doesn't really matter where in your filesystem you put them,
but they must go under a common root, for example, ~/lang-files.

```
mkdir -p ~/lang-files
cd ~/lang-files
unzip ~/lang-downloads/frk-jbarth-ubhd.zip -d frk
unzip ~/lang-downloads/frk-stweil-gt.zip -d frk
mkdir -p ./frk-ligatures
cp ./frk/abbyy-vs-tesseract/*.tif ./frk-ligatures/
cp ./frk/gt/*.txt ./frk-ligatures/

cd ./frk-ligatures/
ls -1 *.tif >pages
sed -i -e 's/.tif//g' pages
cat pages
```
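After these commands, the pages file is just a plain list of page base names (one per line, with the .tif extension stripped), which runtestset.sh and counttestset.sh read to locate each image. The names below are only illustrative:

```
page_0001
page_0002
page_0003
```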
Download a German stopwords list for use by wordacc:

```
mkdir -p ~/lang-stopwords
cd ~/lang-stopwords
wget -O frk.stopwords.txt https://raw.githubusercontent.com/stopwords-iso/stopwords-de/master/stopwords-de.txt
```

Edit ~/lang-stopwords/frk.stopwords.txt, because wordacc expects a space-delimited stopwords file, not a line-delimited one:

```
sed -i -z -e 's/\n/ /g' frk.stopwords.txt
cat frk.stopwords.txt
```
### Step 5: Run langtests/runlangtests.sh with the root ISRI data dir, testname, tessdata-dir and language code:

```
cd ~/tesseract
langtests/runlangtests.sh ~/lang-files 4_fast_Fraktur ../tessdata_fast/script Fraktur
langtests/runlangtests.sh ~/lang-files 4_fast_frk ../tessdata_fast frk
langtests/runlangtests.sh ~/lang-files 4_best_int_frk ../tessdata frk
langtests/runlangtests.sh ~/lang-files 4_best_frk ../tessdata_best frk

langtests/runlangtests.sh ~/lang-files 4_shreetest_frk-Fraktur /home/ubuntu/tessdata_frk/frk-finetune-impact frk
langtests/runlangtests.sh ~/lang-files 4_shreetest_frk-frk /home/ubuntu/tessdata_frk/frk-finetune-frk frk
```

Then go to the gym, have lunch, etc. It takes a while to run.
### Step 6: There should be a RELEASE.summary file in
*langtests/reports/* (e.g. *langtests/reports/4_fast_frk.summary*) that contains the final summarized accuracy.
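For reference, the report format produced by runlangtests.sh looks like the checked-in example below (values taken from langtests/reports/4_fast_frk.summary in this commit):

```
RELEASE     TestSet        CharErrors  Accuracy  WordErrors  Accuracy  NonStopWErrors  Accuracy  TimeTaken
4_fast_frk  frk-ligatures  244         92.78%    109         79.63%    80              73.15     89.98s
```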
#### Notes from Nick White regarding wordacc

If you just want to remove all lines which have 100% recognition,
you can add an awk command like this:

    ocrevalutf8 wordacc ground.txt ocr.txt | awk '$3 != 100 {print $0}' > results.txt

or if you've already got a results file you want to change, you can do this:

    awk '$3 != 100 {print $0}' results.txt > newresults.txt

If you only want the last sections where things are broken down by
word, you can add a sed command, like this:

    ocrevalutf8 wordacc ground.txt ocr.txt | sed '/^ Count Missed %Right $/,$ !d' | awk '$3 != 100 {print $0}' > results.txt
52 langtests/counttestset.sh Executable file
@@ -0,0 +1,52 @@
#!/bin/bash
# File:        counttestset.sh
# Description: Script to count the errors on a single UNLV set.
# Author:      Ray Smith
# Created:     Wed Jun 13 11:58:01 PDT 2007
#
# (C) Copyright 2007, Google Inc.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

if [ $# -ne 2 ]
then
  echo "Usage:$0 pagesfile langcode"
  exit 1
fi

pages=$1
langcode=$2

imdir=${pages%/pages}
setname=${imdir##*/}
resdir=langtests/results/$setname
mkdir -p langtests/reports
echo "Counting on set $setname in directory $imdir to $resdir"
accfiles=""
wafiles=""
while read page dir
do
  if [ "$dir" ]
  then
    srcdir="$imdir/$dir"
  else
    srcdir="$imdir"
  fi
  echo "$srcdir/$page.tif"
  # Count character errors.
  ocrevalutf8 accuracy "$srcdir/$page.txt" "$resdir/$page.txt" > "$resdir/$page.acc"
  accfiles="$accfiles $resdir/$page.acc"
  # Count word errors.
  ocrevalutf8 wordacc -S"$resdir/$langcode.stopwords" "$srcdir/$page.txt" "$resdir/$page.txt" > "$resdir/$page.wa"
  wafiles="$wafiles $resdir/$page.wa"
done <"$pages"

accsum $accfiles >"langtests/results/$setname.characc"
wordaccsum $wafiles >"langtests/results/$setname.wordacc"
24 langtests/frk_setup.sh Normal file
@@ -0,0 +1,24 @@
#!/bin/bash
#
mkdir -p ~/lang-downloads
cd ~/lang-downloads
wget -O frk-jbarth-ubhd.zip http://digi.ub.uni-heidelberg.de/diglitData/v/abbyy11r8-vs-tesseract4.zip
wget -O frk-stweil-gt.zip https://digi.bib.uni-mannheim.de/~stweil/fraktur-gt.zip

mkdir -p ~/lang-files
cd ~/lang-files
unzip ~/lang-downloads/frk-jbarth-ubhd.zip -d frk
unzip ~/lang-downloads/frk-stweil-gt.zip -d frk
mkdir -p ./frk-ligatures
cp ./frk/abbyy-vs-tesseract/*.tif ./frk-ligatures/
cp ./frk/gt/*.txt ./frk-ligatures/

cd ./frk-ligatures/
ls -1 *.tif >pages
sed -i -e 's/.tif//g' pages

mkdir -p ~/lang-stopwords
cd ~/lang-stopwords
wget -O frk.stopwords.txt https://raw.githubusercontent.com/stopwords-iso/stopwords-de/master/stopwords-de.txt

echo "Edit ~/lang-stopwords/frk.stopwords.txt as wordacc uses a space delimited stopwords file, not line delimited."
13 langtests/frk_test.sh Normal file
@@ -0,0 +1,13 @@
#!/bin/bash
#
# run langtests/runlangtests.sh with the root ISRI data dir, testname, tessdata-dir, language code:

cd ~/tesseract
langtests/runlangtests.sh ~/lang-files 4_fast_Fraktur ../tessdata_fast/script Fraktur

langtests/runlangtests.sh ~/lang-files 4_fast_frk ../tessdata_fast frk
langtests/runlangtests.sh ~/lang-files 4_best_int_frk ../tessdata frk
langtests/runlangtests.sh ~/lang-files 4_best_frk ../tessdata_best frk

### It takes a while to run.
2 langtests/reports/4_best_frk.summary Normal file
@@ -0,0 +1,2 @@
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken
4_best_frk frk-ligatures 178 94.73% 100 81.31% 74 75.17 94.29s
2 langtests/reports/4_best_int_frk.summary Normal file
@@ -0,0 +1,2 @@
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken
4_best_int_frk frk-ligatures 244 92.78% 109 79.63% 80 73.15 89.80s
2 langtests/reports/4_fast_Fraktur.summary Normal file
@@ -0,0 +1,2 @@
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken
4_fast_Fraktur frk-ligatures 265 92.16% 116 78.32% 82 72.48 91.29s
2 langtests/reports/4_fast_frk.summary Normal file
@@ -0,0 +1,2 @@
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken
4_fast_frk frk-ligatures 244 92.78% 109 79.63% 80 73.15 89.98s
100 langtests/runlangtests.sh Executable file
@@ -0,0 +1,100 @@
#!/bin/bash
##############################################################################
# File:        runlangtests.sh
# Description: Script to run a set of language test sets,
#              based on runalltests.sh by Ray Smith
# Author:      Shree Devi Kumar
# Created:     June 09, 2018
#
# (C) Copyright 2007, Google Inc.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
##############################################################################
if [ $# -ne 4 ]
then
  echo "Usage:$0 unlv-data-dir version-id tessdata-dir langcode"
  exit 1
fi

tessdata=$3
lang=$4

# timesum computes the total cpu time
timesum() {
  awk ' BEGIN {
    total = 0.0;
  }
  {
    total += $2;
  }
  END {
    printf("%.2f\n", total);
  }' "$1"
}

imdir="$1"
vid="$2"
bindir=${0%/*}
if [ "$bindir" = "$0" ]
then
  bindir="./"
fi
rdir=langtests/reports
if [ "$lang" = "frk" ] || [ "$lang" = "Fraktur" ]
then
  testsets="frk-ligatures"
fi

totalerrs=0
totalwerrs=0
totalnswerrs=0
for set in $testsets
do
  resdir=langtests/results/$set
  mkdir -p "$resdir"
  cp ~/lang-stopwords/frk.stopwords.txt "$resdir/$lang.stopwords"
  if [ -r "$imdir/$set/pages" ]
  then
    # Run tesseract on all the pages.
    $bindir/runtestset.sh "$imdir/$set/pages" "$tessdata" $lang
    # Count the errors on all the pages.
    $bindir/counttestset.sh "$imdir/$set/pages" $lang
    # Get the new character word and nonstop word errors and accuracy.
    cherrs=$(head -4 "langtests/results/$set.characc" |tail -1 |cut -c1-9 |
      tr -d '[:blank:]')
    chacc=$(head -5 "langtests/results/$set.characc" |tail -1 |cut -c1-9 |
      tr -d '[:blank:]')
    wderrs=$(head -4 "langtests/results/$set.wordacc" |tail -1 |cut -c1-9 |
      tr -d '[:blank:]')
    wdacc=$(head -5 "langtests/results/$set.wordacc" |tail -1 |cut -c1-9 |
      tr -d '[:blank:]')
    nswderrs=$(grep Total "langtests/results/$set.wordacc" |head -2 |tail -1 |
      cut -c10-17 |tr -d '[:blank:]')
    nswdacc=$(grep Total "langtests/results/$set.wordacc" |head -2 |tail -1 |
      cut -c19-26 |tr -d '[:blank:]')

    sumfile=$rdir/$vid.$set.sum
    if [ -r "langtests/results/$set.times" ]
    then
      total_time=$(timesum "langtests/results/$set.times")
    else
      total_time='0.0'
    fi
    echo "RELEASE TestSet CharErrors Accuracy WordErrors Accuracy\
 NonStopWErrors Accuracy TimeTaken">"$sumfile"
    echo "$vid $set $cherrs $chacc $wderrs $wdacc\
 $nswderrs $nswdacc ${total_time}s" >>"$sumfile"
  fi
done

cat "$rdir/$vid".*.sum >"$rdir/$vid".summary

mv "$rdir/$vid".*.sum langtests/results/
cat "$rdir/$vid".summary
60 langtests/runtestset.sh Executable file
@@ -0,0 +1,60 @@
#!/bin/bash
# File:        runtestset.sh
# Description: Script to run tesseract on a single UNLV set.
# Author:      Ray Smith
# Created:     Wed Jun 13 10:13:01 PDT 2007
#
# (C) Copyright 2007, Google Inc.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

if [ $# -ne 3 ]
then
  echo "Usage:$0 pagesfile tessdata-dir langcode"
  exit 1
fi

tess="time -f %U -o times.txt ./src/api/tesseract"

tessdata=$2
langcode=$3
pages=$1
imdir=${pages%/pages}
setname=${imdir##*/}

config=""
resdir=langtests/results/$setname

echo -e "Testing on set $setname in directory $imdir to $resdir\n"
mkdir -p "$resdir"
rm -f "langtests/results/$setname.times"
while read page dir
do
  # A pages file may be a list of files with subdirs or maybe just
  # a plain list of files so accommodate both.
  if [ "$dir" ]
  then
    srcdir="$imdir/$dir"
  else
    srcdir="$imdir"
  fi
  echo "$srcdir/$page.tif"
  $tess "$srcdir/$page.tif" "$resdir/$page" --tessdata-dir $tessdata --oem 1 -l $langcode --psm 6 $config 2>&1 |grep -v "OCR Engine" |grep -v "Page 1"
  if [ -r times.txt ]
  then
    read t <times.txt
    echo "$page $t" >>"langtests/results/$setname.times"
    echo -e "\033M$page $t"
    if [ "$t" = "Command terminated by signal 2" ]
    then
      exit 0
    fi
  fi
done <"$pages"