mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2025-01-18 06:30:14 +08:00
Add langtests framework with frk example
This commit is contained in:
parent
a849860c6e
commit
92922b421c
@ -24,7 +24,7 @@ SUBDIRS += src/ccmain src/api . tessdata doc unittest
|
|||||||
|
|
||||||
EXTRA_DIST = README.md\
|
EXTRA_DIST = README.md\
|
||||||
aclocal.m4 config configure.ac autogen.sh contrib \
|
aclocal.m4 config configure.ac autogen.sh contrib \
|
||||||
tesseract.pc.in $(TRAINING_SUBDIR) java doc unlvtests
|
tesseract.pc.in $(TRAINING_SUBDIR) java doc langtests unlvtests
|
||||||
|
|
||||||
DIST_SUBDIRS = $(SUBDIRS) $(TRAINING_SUBDIR)
|
DIST_SUBDIRS = $(SUBDIRS) $(TRAINING_SUBDIR)
|
||||||
|
|
||||||
|
@ -466,6 +466,7 @@ fi
|
|||||||
|
|
||||||
# Output files
|
# Output files
|
||||||
AC_CONFIG_FILES([Makefile tesseract.pc])
|
AC_CONFIG_FILES([Makefile tesseract.pc])
|
||||||
|
AC_CONFIG_FILES([langtests/Makefile])
|
||||||
AC_CONFIG_FILES([src/api/Makefile])
|
AC_CONFIG_FILES([src/api/Makefile])
|
||||||
AC_CONFIG_FILES([src/api/tess_version.h])
|
AC_CONFIG_FILES([src/api/tess_version.h])
|
||||||
AC_CONFIG_FILES([src/arch/Makefile])
|
AC_CONFIG_FILES([src/arch/Makefile])
|
||||||
|
2
langtests/.gitignore
vendored
Normal file
2
langtests/.gitignore
vendored
Normal file
@ -0,0 +1,2 @@
|
|||||||
|
#
|
||||||
|
results/*
|
8
langtests/Makefile.am
Normal file
8
langtests/Makefile.am
Normal file
@ -0,0 +1,8 @@
|
|||||||
|
|
||||||
|
# Files from this directory that are shipped in the distribution tarball:
# the README, the helper scripts and the committed summary reports.
EXTRA_DIST = \
	README.md \
	frk_setup.sh \
	frk_test.sh \
	counttestset.sh \
	runlangtests.sh \
	runtestset.sh \
	reports/*
|
98
langtests/README.md
Normal file
98
langtests/README.md
Normal file
@ -0,0 +1,98 @@
|
|||||||
|
# How to run Language tests.
|
||||||
|
|
||||||
|
The scripts in this directory make it possible to test Accuracy of Tesseract
|
||||||
|
for different languages.
|
||||||
|
|
||||||
|
### Step 1: If not already installed, download the modified ISRI toolkit,
|
||||||
|
make and install the tools in /usr/local/bin.
|
||||||
|
|
||||||
|
```
|
||||||
|
git clone https://github.com/Shreeshrii/ocr-evaluation-tools.git
|
||||||
|
cd ocr-evaluation-tools
|
||||||
|
sudo make install
|
||||||
|
```
|
||||||
|
|
||||||
|
### Step 2: If not already installed, build tesseract.
|
||||||
|
|
||||||
|
## Testing for Fraktur - frk and script/Fraktur
|
||||||
|
|
||||||
|
### Step 3: download the images and groundtruth
|
||||||
|
|
||||||
|
```
|
||||||
|
mkdir -p ~/lang-downloads
|
||||||
|
cd ~/lang-downloads
|
||||||
|
wget -O frk-jbarth-ubhd.zip http://digi.ub.uni-heidelberg.de/diglitData/v/abbyy11r8-vs-tesseract4.zip
|
||||||
|
wget -O frk-stweil-gt.zip https://digi.bib.uni-mannheim.de/~stweil/fraktur-gt.zip
|
||||||
|
```
|
||||||
|
|
||||||
|
### Step 4: extract the files.
|
||||||
|
It doesn't really matter where in your filesystem you put them,
|
||||||
|
but they must go under a common root, for example, ~/lang-files
|
||||||
|
|
||||||
|
```
|
||||||
|
mkdir -p ~/lang-files
|
||||||
|
cd ~/lang-files
|
||||||
|
unzip ~/lang-downloads/frk-jbarth-ubhd.zip -d frk
|
||||||
|
unzip ~/lang-downloads/frk-stweil-gt.zip -d frk
|
||||||
|
mkdir -p ./frk-ligatures
|
||||||
|
cp ./frk/abbyy-vs-tesseract/*.tif ./frk-ligatures/
|
||||||
|
cp ./frk/gt/*.txt ./frk-ligatures/
|
||||||
|
|
||||||
|
cd ./frk-ligatures/
|
||||||
|
ls -1 *.tif >pages
|
||||||
|
sed -i -e 's/.tif//g' pages
|
||||||
|
cat pages
|
||||||
|
```
|
||||||
|
|
||||||
|
```
|
||||||
|
mkdir -p ~/lang-stopwords
|
||||||
|
cd ~/lang-stopwords
|
||||||
|
wget -O frk.stopwords.txt https://raw.githubusercontent.com/stopwords-iso/stopwords-de/master/stopwords-de.txt
|
||||||
|
```
|
||||||
|
Edit ~/lang-stopwords/frk.stopwords.txt, as
|
||||||
|
wordacc uses a space delimited stopwords file, not line delimited.
|
||||||
|
|
||||||
|
```
|
||||||
|
sed -i -e ':a;N;$!ba;s/\n/ /g' frk.stopwords.txt
|
||||||
|
cat frk.stopwords.txt
|
||||||
|
```
|
||||||
|
|
||||||
|
### Step 5: run langtests/runlangtests.sh with the root ISRI data dir, testname, tessdata-dir, language code:
|
||||||
|
|
||||||
|
```
|
||||||
|
cd ~/tesseract
|
||||||
|
langtests/runlangtests.sh ~/lang-files 4_fast_Fraktur ../tessdata_fast/script Fraktur
|
||||||
|
langtests/runlangtests.sh ~/lang-files 4_fast_frk ../tessdata_fast frk
|
||||||
|
langtests/runlangtests.sh ~/lang-files 4_best_int_frk ../tessdata frk
|
||||||
|
langtests/runlangtests.sh ~/lang-files 4_best_frk ../tessdata_best frk
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
langtests/runlangtests.sh ~/lang-files 4_shreetest_frk-Fraktur /home/ubuntu/tessdata_frk/frk-finetune-impact frk
|
||||||
|
langtests/runlangtests.sh ~/lang-files 4_shreetest_frk-frk /home/ubuntu/tessdata_frk/frk-finetune-frk frk
|
||||||
|
```
|
||||||
|
and go to the gym, have lunch etc. It takes a while to run.
|
||||||
|
|
||||||
|
### Step 6: There should be a RELEASE.summary file
|
||||||
|
(for example *langtests/reports/4_fast_frk.summary*) that contains the final summarized accuracy
|
||||||
|
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Notes from Nick White regarding wordacc
|
||||||
|
|
||||||
|
If you just want to remove all lines which have 100% recognition,
|
||||||
|
you can add a 'awk' command like this:
|
||||||
|
|
||||||
|
ocrevalutf8 wordacc ground.txt ocr.txt | awk '$3 != 100 {print $0}'
|
||||||
|
> results.txt
|
||||||
|
|
||||||
|
or if you've already got a results file you want to change, you can do this:
|
||||||
|
|
||||||
|
awk '$3 != 100 {print $0}' results.txt > newresults.txt
|
||||||
|
|
||||||
|
If you only want the last sections where things are broken down by
|
||||||
|
word, you can add a sed command, like this:
|
||||||
|
|
||||||
|
ocrevalutf8 wordacc ground.txt ocr.txt | sed '/^ Count Missed %Right $/,$
|
||||||
|
!d' | awk '$3 != 100 {print $0}' > results.txt
|
52
langtests/counttestset.sh
Executable file
52
langtests/counttestset.sh
Executable file
@ -0,0 +1,52 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
# File: counttestset.sh
|
||||||
|
# Description: Script to count the errors on a single UNLV set.
|
||||||
|
# Author: Ray Smith
|
||||||
|
# Created: Wed Jun 13 11:58:01 PDT 2007
|
||||||
|
#
|
||||||
|
# (C) Copyright 2007, Google Inc.
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
# Count character and word errors for every page of one test set.
#
# Usage: counttestset.sh pagesfile langcode
#   pagesfile  path of the form <imdir>/pages; each line is "page [subdir]"
#   langcode   selects the stopwords file $resdir/$langcode.stopwords
#
# For each page, the ground truth <srcdir>/<page>.txt is compared with the
# OCR output langtests/results/<setname>/<page>.txt using ocrevalutf8.
# The per-page reports are then merged into
#   langtests/results/<setname>.characc  (via accsum)
#   langtests/results/<setname>.wordacc  (via wordaccsum)
# Paths are relative, so the script must be run from the directory that
# contains langtests/.
if [ $# -ne 2 ]
then
  echo "Usage:$0 pagesfile langcode"
  exit 1
fi

pages=$1
langcode=$2

# The image directory is the pages path without its trailing "/pages";
# the set name is the last component of that directory.
imdir=${pages%/pages}
setname=${imdir##*/}
resdir=langtests/results/$setname
mkdir -p langtests/reports
echo "Counting on set $setname in directory $imdir to $resdir"

# Arrays keep each report path as one word, whatever characters it contains.
accfiles=()
wafiles=()
while read -r page dir
do
  # Ignore blank lines instead of evaluating a page with an empty name.
  [ -n "$page" ] || continue
  # A pages line may carry an optional sub-directory as second field.
  if [ "$dir" ]
  then
    srcdir="$imdir/$dir"
  else
    srcdir="$imdir"
  fi
  echo "$srcdir/$page.tif"
  # Count character errors.
  ocrevalutf8 accuracy "$srcdir/$page.txt" "$resdir/$page.txt" > "$resdir/$page.acc"
  accfiles+=("$resdir/$page.acc")
  # Count word errors.
  ocrevalutf8 wordacc -S"$resdir/$langcode.stopwords" "$srcdir/$page.txt" "$resdir/$page.txt" > "$resdir/$page.wa"
  wafiles+=("$resdir/$page.wa")
done <"$pages"

# Merge the per-page reports into one report per set.
accsum "${accfiles[@]}" >"langtests/results/$setname.characc"
wordaccsum "${wafiles[@]}" >"langtests/results/$setname.wordacc"
|
24
langtests/frk_setup.sh
Normal file
24
langtests/frk_setup.sh
Normal file
@ -0,0 +1,24 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
#
|
||||||
|
# Download and lay out the Fraktur test data used by the language tests:
#   ~/lang-downloads             the two downloaded zip archives
#   ~/lang-files/frk             the unpacked archives
#   ~/lang-files/frk-ligatures   images (*.tif), ground truth (*.txt) and
#                                the "pages" list (one page name per line)
#   ~/lang-stopwords             the stopwords list
# Every cd is checked so that a failure cannot make the following commands
# download or unpack into whatever directory happens to be current.
mkdir -p ~/lang-downloads
cd ~/lang-downloads || exit 1
wget -O frk-jbarth-ubhd.zip http://digi.ub.uni-heidelberg.de/diglitData/v/abbyy11r8-vs-tesseract4.zip
wget -O frk-stweil-gt.zip https://digi.bib.uni-mannheim.de/~stweil/fraktur-gt.zip

mkdir -p ~/lang-files
cd ~/lang-files || exit 1
unzip ~/lang-downloads/frk-jbarth-ubhd.zip -d frk
unzip ~/lang-downloads/frk-stweil-gt.zip -d frk
mkdir -p ./frk-ligatures
cp ./frk/abbyy-vs-tesseract/*.tif ./frk-ligatures/
cp ./frk/gt/*.txt ./frk-ligatures/

cd ./frk-ligatures/ || exit 1
# Write the page list: each image name with only its trailing ".tif" removed.
# (A sed 's/.tif//g' would also eat any other character followed by "tif"
# anywhere in the name.)
for tif in *.tif
do
  [ -e "$tif" ] || continue
  printf '%s\n' "${tif%.tif}"
done >pages

mkdir -p ~/lang-stopwords
cd ~/lang-stopwords || exit 1
wget -O frk.stopwords.txt https://raw.githubusercontent.com/stopwords-iso/stopwords-de/master/stopwords-de.txt

echo "Edit ~/lang-stopwords/frk.stopwords.txt as wordacc uses a space delimited stopwords file, not line delimited."
|
13
langtests/frk_test.sh
Normal file
13
langtests/frk_test.sh
Normal file
@ -0,0 +1,13 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
#
|
||||||
|
# run langtests/runlangtests.sh with the root ISRI data dir, testname, tessdata-dir, language code:
|
||||||
|
|
||||||
|
# Run the Fraktur language tests against four model sets. The relative
# tessdata paths (../tessdata*) and langtests/ are resolved from the
# tesseract source tree, so stop if that directory cannot be entered.
cd ~/tesseract || exit 1

# Script model.
langtests/runlangtests.sh ~/lang-files 4_fast_Fraktur ../tessdata_fast/script Fraktur

# Language models.
langtests/runlangtests.sh ~/lang-files 4_fast_frk ../tessdata_fast frk
langtests/runlangtests.sh ~/lang-files 4_best_int_frk ../tessdata frk
langtests/runlangtests.sh ~/lang-files 4_best_frk ../tessdata_best frk
|
||||||
|
|
||||||
|
### It takes a while to run.
|
||||||
|
|
2
langtests/reports/4_best_frk.summary
Normal file
2
langtests/reports/4_best_frk.summary
Normal file
@ -0,0 +1,2 @@
|
|||||||
|
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken
|
||||||
|
4_best_frk frk-ligatures 178 94.73% 100 81.31% 74 75.17 94.29s
|
2
langtests/reports/4_best_int_frk.summary
Normal file
2
langtests/reports/4_best_int_frk.summary
Normal file
@ -0,0 +1,2 @@
|
|||||||
|
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken
|
||||||
|
4_best_int_frk frk-ligatures 244 92.78% 109 79.63% 80 73.15 89.80s
|
2
langtests/reports/4_fast_Fraktur.summary
Normal file
2
langtests/reports/4_fast_Fraktur.summary
Normal file
@ -0,0 +1,2 @@
|
|||||||
|
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken
|
||||||
|
4_fast_Fraktur frk-ligatures 265 92.16% 116 78.32% 82 72.48 91.29s
|
2
langtests/reports/4_fast_frk.summary
Normal file
2
langtests/reports/4_fast_frk.summary
Normal file
@ -0,0 +1,2 @@
|
|||||||
|
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken
|
||||||
|
4_fast_frk frk-ligatures 244 92.78% 109 79.63% 80 73.15 89.98s
|
100
langtests/runlangtests.sh
Executable file
100
langtests/runlangtests.sh
Executable file
@ -0,0 +1,100 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
##############################################################################
|
||||||
|
# File: runlangtests.sh
|
||||||
|
# Description: Script to run a set of language test sets (currently Fraktur).
|
||||||
|
# based on runalltests.sh by Ray Smith
|
||||||
|
# Author: Shree Devi Kumar
|
||||||
|
# Created: June 09, 2018
|
||||||
|
#
|
||||||
|
# (C) Copyright 2007, Google Inc.
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
##############################################################################
|
||||||
|
# Exactly four positional arguments are required; otherwise print the
# usage line and stop.
if (( $# != 4 )); then
  echo "Usage:$0 unlv-data-dir version-id tessdata-dir langcode"
  exit 1
fi

# Fourth argument: language code; third argument: tessdata directory.
lang=$4
tessdata=$3
|
||||||
|
|
||||||
|
#######################################
# timesum computes the total cpu time.
# Arguments: $1 - file whose lines have the form "<page> <seconds>"
# Outputs:   the sum of the second field over all lines, printed to
#            stdout with two decimals ("0.00" for an empty file)
#######################################
timesum() {
  # The numeric locale is pinned so the total is always printed with a '.'
  # decimal separator, independent of the caller's environment.
  LC_ALL=C awk '
    { total += $2 }
    END { printf("%.2f\n", total) }
  ' "$1"
}
|
||||||
|
|
||||||
|
# Main flow: for every test set of the requested language, run OCR and
# error counting, then write one summary per set and a combined summary
# langtests/reports/<version-id>.summary.
# Uses $lang and $tessdata (taken from the command line earlier in this
# script) and the timesum function of this script.
imdir="$1"
vid="$2"
# Directory part of the path this script was invoked with; the helper
# scripts runtestset.sh and counttestset.sh are run from there.
bindir=${0%/*}
if [ "$bindir" = "$0" ]
then
  bindir="./"
fi
rdir=langtests/reports
mkdir -p "$rdir"

# Map the language code to its test sets. An unknown code used to leave
# the list empty, which produced an empty summary and exit status 0.
case "$lang" in
  frk|Fraktur)
    testsets="frk-ligatures"
    ;;
  *)
    echo "No test sets are defined for language '$lang'" >&2
    exit 1
    ;;
esac

sumcount=0
# $testsets is a space separated list, so it is split on purpose.
for set in $testsets
do
  resdir=langtests/results/$set
  mkdir -p "$resdir"
  cp ~/lang-stopwords/frk.stopwords.txt "$resdir/$lang.stopwords"
  if [ ! -r "$imdir/$set/pages" ]
  then
    echo "Skipping $set: cannot read $imdir/$set/pages" >&2
    continue
  fi
  # Run tesseract on all the pages.
  "$bindir/runtestset.sh" "$imdir/$set/pages" "$tessdata" "$lang"
  # Count the errors on all the pages.
  "$bindir/counttestset.sh" "$imdir/$set/pages" "$lang"
  # Get the new character word and nonstop word errors and accuracy.
  # The values are cut from fixed lines and columns of the merged reports:
  # line 4 / line 5, columns 1-9, and the second "Total" line of the
  # word report.
  cherrs=$(head -4 "langtests/results/$set.characc" |tail -1 |cut -c1-9 |
    tr -d '[:blank:]')
  chacc=$(head -5 "langtests/results/$set.characc" |tail -1 |cut -c1-9 |
    tr -d '[:blank:]')
  wderrs=$(head -4 "langtests/results/$set.wordacc" |tail -1 |cut -c1-9 |
    tr -d '[:blank:]')
  wdacc=$(head -5 "langtests/results/$set.wordacc" |tail -1 |cut -c1-9 |
    tr -d '[:blank:]')
  nswderrs=$(grep Total "langtests/results/$set.wordacc" |head -2 |tail -1 |
    cut -c10-17 |tr -d '[:blank:]')
  nswdacc=$(grep Total "langtests/results/$set.wordacc" |head -2 |tail -1 |
    cut -c19-26 |tr -d '[:blank:]')

  sumfile=$rdir/$vid.$set.sum
  # runtestset.sh writes the per-page times; without them report 0.0.
  if [ -r "langtests/results/$set.times" ]
  then
    total_time=$(timesum "langtests/results/$set.times")
  else
    total_time='0.0'
  fi
  echo "RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken" >"$sumfile"
  echo "$vid $set $cherrs $chacc $wderrs $wdacc $nswderrs $nswdacc ${total_time}s" >>"$sumfile"
  sumcount=$((sumcount + 1))
done

# Without any per-set summary the glob below would not match anything.
if [ "$sumcount" -eq 0 ]
then
  echo "No test set could be processed for language '$lang'" >&2
  exit 1
fi

# Combine the per-set summaries, move them out of the reports directory
# and show the combined result.
cat "$rdir/$vid".*.sum >"$rdir/$vid".summary

mv "$rdir/$vid".*.sum langtests/results/

cat "$rdir/$vid".summary
|
60
langtests/runtestset.sh
Executable file
60
langtests/runtestset.sh
Executable file
@ -0,0 +1,60 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
# File: runtestset.sh
|
||||||
|
# Description: Script to run tesseract on a single UNLV set.
|
||||||
|
# Author: Ray Smith
|
||||||
|
# Created: Wed Jun 13 10:13:01 PDT 2007
|
||||||
|
#
|
||||||
|
# (C) Copyright 2007, Google Inc.
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
# Run tesseract on every page of one test set and record the user CPU time.
#
# Usage: runtestset.sh pagesfile tessdata-dir langcode
#   pagesfile     path of the form <imdir>/pages; each line is "page [subdir]"
#   tessdata-dir  passed to tesseract as --tessdata-dir
#   langcode      passed to tesseract as -l
#
# OCR output goes to langtests/results/<setname>/<page>.*, and one
# "<page> <time>" line per page is appended to
# langtests/results/<setname>.times. Paths are relative, so the script must
# be run from the top of the tesseract build tree.
if [ $# -ne 3 ]
then
  echo "Usage:$0 pagesfile tessdata-dir langcode "
  exit 1
fi

# Because "time" comes out of an expansion it is looked up as an ordinary
# command, not as the shell keyword; -f and -o are options of the external
# time program (as in GNU time). It writes the user time to times.txt.
tess=(time -f %U -o times.txt ./src/api/tesseract)

tessdata=$2
langcode=$3
pages=$1
imdir=${pages%/pages}
setname=${imdir##*/}

# Extra tesseract arguments; none by default.
config=()
resdir=langtests/results/$setname

echo -e "Testing on set $setname in directory $imdir to $resdir\n"
mkdir -p "$resdir"
rm -f "langtests/results/$setname.times"
while read -r page dir
do
  # Ignore blank lines instead of running tesseract on ".tif".
  [ -n "$page" ] || continue
  # A pages file may be a list of files with subdirs or maybe just
  # a plain list of files so accommodate both.
  if [ "$dir" ]
  then
    srcdir="$imdir/$dir"
  else
    srcdir="$imdir"
  fi
  echo "$srcdir/$page.tif"
  # Make sure a time left over from an earlier page or run is never reused.
  rm -f times.txt
  "${tess[@]}" "$srcdir/$page.tif" "$resdir/$page" --tessdata-dir "$tessdata" --oem 1 -l "$langcode" --psm 6 "${config[@]}" 2>&1 |grep -v "OCR Engine" |grep -v "Page 1"
  if [ -r times.txt ]
  then
    read -r t <times.txt
    echo "$page $t" >>"langtests/results/$setname.times"
    # ESC M is emitted before the text, as in the original script;
    # presumably it moves the cursor up so this line replaces the file
    # name echoed above.
    echo -e "\033M$page $t"
    # The time program reports an interrupted command with this text;
    # stop processing further pages in that case.
    if [ "$t" = "Command terminated by signal 2" ]
    then
      exit 0
    fi
  fi
done <"$pages"
|
Loading…
Reference in New Issue
Block a user