Merge pull request #1650 from Shreeshrii/master

Add Spanish UNLV tests, use ocreval tools from /usr/local/bin
This commit is contained in:
Egor Pugin 2018-06-09 10:59:50 +03:00 committed by GitHub
commit 3f725dd92e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 90 additions and 46 deletions

View File

@ -1,45 +1,70 @@
How to run UNLV tests. ## How to run UNLV tests.
The scripts in this directory make it possible to duplicate the tests The scripts in this directory make it possible to duplicate the tests
published in the Fourth Annual Test of OCR Accuracy. published in the Fourth Annual Test of OCR Accuracy.
See http://www.isri.unlv.edu/downloads/AT-1995.pdf See http://www.expervision.com/wp-content/uploads/2012/12/1995.The_Fourth_Annual_Test_of_OCR_Accuracy.pdf
but first you have to get the tools and data used by UNLV: but first you have to get the tools and data used by UNLV:
Step 1: to download the images goto ### Step 1: to download the images go to
https://sourceforge.net/projects/isri-ocr-evaluation-tools-alt/files/ https://sourceforge.net/projects/isri-ocr-evaluation-tools-alt/files/
and get doe3.3B.tar.gz, bus.3B.tar.gz, mag.3B.tar.gz and news.3B.tar.gz and get doe3.3B.tar.gz, bus.3B.tar.gz, mag.3B.tar.gz and news.3B.tar.gz
The spn.3B.tar.gz in that repository is incorrect, so download it from the Google Code archive instead.
```
mkdir -p ~/isri-downloads mkdir -p ~/isri-downloads
cd ~/isri-downloads cd ~/isri-downloads
curl -L https://sourceforge.net/projects/isri-ocr-evaluation-tools-alt/files/bus.3B.tar.gz > bus.3B.tar.gz curl -L https://sourceforge.net/projects/isri-ocr-evaluation-tools-alt/files/bus.3B.tar.gz > bus.3B.tar.gz
curl -L https://sourceforge.net/projects/isri-ocr-evaluation-tools-alt/files/doe3.3B.tar.gz > doe3.3B.tar.gz curl -L https://sourceforge.net/projects/isri-ocr-evaluation-tools-alt/files/doe3.3B.tar.gz > doe3.3B.tar.gz
curl -L https://sourceforge.net/projects/isri-ocr-evaluation-tools-alt/files/mag.3B.tar.gz > mag.3B.tar.gz curl -L https://sourceforge.net/projects/isri-ocr-evaluation-tools-alt/files/mag.3B.tar.gz > mag.3B.tar.gz
curl -L https://sourceforge.net/projects/isri-ocr-evaluation-tools-alt/files/news.3B.tar.gz > news.3B.tar.gz curl -L https://sourceforge.net/projects/isri-ocr-evaluation-tools-alt/files/news.3B.tar.gz > news.3B.tar.gz
curl -L https://storage.googleapis.com/google-code-archive-downloads/v2/code.google.com/isri-ocr-evaluation-tools/spn.3B.tar.gz > spn.3B.tar.gz
```
Step 2: extract the files. It doesn't really matter where ### Step 2: extract the files.
It doesn't really matter where
in your filesystem you put them, but they must go under a common in your filesystem you put them, but they must go under a common
root so you have directories doe3.3B, bus.3B, mag.3B and news.3B. in, for example, root so you have directories doe3.3B, bus.3B, mag.3B and news.3B. in, for example,
~/ISRI-OCRtk. ~/ISRI-OCRtk.
```
mkdir -p ~/ISRI-OCRtk mkdir -p ~/ISRI-OCRtk
cd ~/ISRI-OCRtk cd ~/ISRI-OCRtk
tar xzvf ~/isri-downloads/bus.3B.tar.gz tar xzvf ~/isri-downloads/bus.3B.tar.gz
tar xzvf ~/isri-downloads/doe3.3B.tar.gz tar xzvf ~/isri-downloads/doe3.3B.tar.gz
tar xzvf ~/isri-downloads/mag.3B.tar.gz tar xzvf ~/isri-downloads/mag.3B.tar.gz
tar xzvf ~/isri-downloads/news.3B.tar.gz tar xzvf ~/isri-downloads/news.3B.tar.gz
tar xzvf ~/isri-downloads/spn.3B.tar.gz
```
Step 4: Download the modified ISRI toolkit from: Edit *~/ISRI-OCRtk/spn.3B/pages*
https://ancientgreekocr.org/ocr-evaluation-tools.git delete the line containing the following imagename as it crashes tesseract.
make and install the tools in unlvtests/ocreval/bin by 7733_005.3B.tif
`make PREFIX=~/tesseract/unlvtests/ocreval install`
Step 6: cd back to your main tesseract-ocr dir and Build tesseract. ### Step 3: Download the modified ISRI toolkit, make and install the tools :
These will be installed in /usr/local/bin.
Step 7: run unlvtests/runalltests.sh with the root ISRI data dir and testname: ```
unlvtests/runalltests.sh ~/ISRI-OCRtk tess4.0.0-beta.1 git clone https://github.com/Shreeshrii/ocr-evaluation-tools.git
and go to the gym, have lunch etc. cd ocr-evaluation-tools
sudo make install
```
Step 8: There should be a file ### Step 4: cd back to your main tesseract-ocr dir and Build tesseract.
unlvtests/reports/tess4.0.0-beta.1.summary that contains the final summarized accuracy
### Step 5: run unlvtests/runalltests.sh with the root ISRI data dir, testname, tessdata-dir and language:
```
unlvtests/runalltests.sh ~/ISRI-OCRtk 4_fast_eng ../tessdata_fast eng
```
and go to the gym, have lunch etc. It takes a while to run.
### Step 6: There should be a RELEASE.summary file
*unlvtests/reports/4_fast_eng.summary* (named after the testname you passed in Step 5) that contains the final summarized accuracy
report and comparison with the 1995 results. report and comparison with the 1995 results.
### Step 7: run the test for Spanish.
```
unlvtests/runalltests.sh ~/ISRI-OCRtk 4_fast_spa ../tessdata_fast spa
```

View File

@ -43,17 +43,17 @@ do
else else
srcdir="$imdir" srcdir="$imdir"
fi fi
echo "$srcdir/$page.tif" #echo "$srcdir/$page.tif"
# Count character errors. # Count character errors.
unlvtests/ocreval/bin/ocrevalutf8 unlvtests/ocreval/bin/accuracy "$srcdir/$page.txt" "$resdir/$page.unlv" "$resdir/$page.acc" ocrevalutf8 accuracy "$srcdir/$page.txt" "$resdir/$page.unlv" > "$resdir/$page.acc"
accfiles="$accfiles $resdir/$page.acc" accfiles="$accfiles $resdir/$page.acc"
# Count word errors. # Count word errors.
unlvtests/ocreval/bin/ocrevalutf8 unlvtests/ocreval/bin/wordacc "$srcdir/$page.txt" "$resdir/$page.unlv" "$resdir/$page.wa" ocrevalutf8 wordacc "$srcdir/$page.txt" "$resdir/$page.unlv" > "$resdir/$page.wa"
wafiles="$wafiles $resdir/$page.wa" wafiles="$wafiles $resdir/$page.wa"
done <"$pages" done <"$pages"
echo "$accfiles" #echo "$accfiles"
echo "$wafiles" #echo "$wafiles"
unlvtests/ocreval/bin/accsum $accfiles >"unlvtests/reports/$setname.characc" accsum $accfiles >"unlvtests/results/$setname.characc"
unlvtests/ocreval/bin/ocrevalutf8 unlvtests/ocreval/bin/wordaccsum $wafiles >"unlvtests/reports/$setname.wordacc" wordaccsum $wafiles >"unlvtests/results/$setname.wordacc"

View File

@ -0,0 +1 @@
1995 spn.3B 100 95.00% 0.00% 100 95.00% 0.00% 100 95.00% 0.00% WAS NOT TESTED

View File

@ -15,9 +15,9 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
if [ $# -ne 2 ] if [ $# -ne 4 ]
then then
echo "Usage:$0 unlv-data-dir version-id" echo "Usage:$0 unlv-data-dir version-id tessdata-dir lang "
exit 1 exit 1
fi fi
if [ ! -d src/api ] if [ ! -d src/api ]
@ -30,7 +30,8 @@ then
echo "Please build tesseract before running $0" echo "Please build tesseract before running $0"
exit 1 exit 1
fi fi
tessdata=$3
lang=$4
#deltapc new old calculates the %change from old to new #deltapc new old calculates the %change from old to new
deltapc() { deltapc() {
@ -60,8 +61,20 @@ then
bindir="./" bindir="./"
fi fi
rdir=unlvtests/reports rdir=unlvtests/reports
#testsets="bus.3B doe3.3B mag.3B news.3B"
testsets="bus.3B" if [ "$lang" = "eng" ]
then
testsets="bus.3B doe3.3B mag.3B news.3B"
#testsets="bus.3B"
else
if [ "$lang" = "spa" ]
then
testsets="spn.3B"
else
echo "Language has to be eng or spa"
exit 1
fi
fi
totalerrs=0 totalerrs=0
totalwerrs=0 totalwerrs=0
@ -74,7 +87,7 @@ do
if [ -r "$imdir/$set/pages" ] if [ -r "$imdir/$set/pages" ]
then then
# Run tesseract on all the pages. # Run tesseract on all the pages.
$bindir/runtestset.sh "$imdir/$set/pages" $bindir/runtestset.sh "$imdir/$set/pages" "$tessdata" "$lang"
# Count the errors on all the pages. # Count the errors on all the pages.
$bindir/counttestset.sh "$imdir/$set/pages" $bindir/counttestset.sh "$imdir/$set/pages"
# Get the old character word and nonstop word errors. # Get the old character word and nonstop word errors.
@ -82,30 +95,30 @@ do
oldwerrs=$(cut -f6 "unlvtests/reports/1995.$set.sum") oldwerrs=$(cut -f6 "unlvtests/reports/1995.$set.sum")
oldnswerrs=$(cut -f9 "unlvtests/reports/1995.$set.sum") oldnswerrs=$(cut -f9 "unlvtests/reports/1995.$set.sum")
# Get the new character word and nonstop word errors and accuracy. # Get the new character word and nonstop word errors and accuracy.
cherrs=$(head -4 "unlvtests/reports/$set.characc" |tail -1 |cut -c1-9 | cherrs=$(head -4 "unlvtests/results/$set.characc" |tail -1 |cut -c1-9 |
tr -d '[:blank:]') tr -d '[:blank:]')
chacc=$(head -5 "unlvtests/reports/$set.characc" |tail -1 |cut -c1-9 | chacc=$(head -5 "unlvtests/results/$set.characc" |tail -1 |cut -c1-9 |
tr -d '[:blank:]') tr -d '[:blank:]')
wderrs=$(head -4 "unlvtests/reports/$set.wordacc" |tail -1 |cut -c1-9 | wderrs=$(head -4 "unlvtests/results/$set.wordacc" |tail -1 |cut -c1-9 |
tr -d '[:blank:]') tr -d '[:blank:]')
wdacc=$(head -5 "unlvtests/reports/$set.wordacc" |tail -1 |cut -c1-9 | wdacc=$(head -5 "unlvtests/results/$set.wordacc" |tail -1 |cut -c1-9 |
tr -d '[:blank:]') tr -d '[:blank:]')
nswderrs=$(grep Total "unlvtests/reports/$set.wordacc" |head -2 |tail -1 | nswderrs=$(grep Total "unlvtests/results/$set.wordacc" |head -2 |tail -1 |
cut -c10-17 |tr -d '[:blank:]') cut -c10-17 |tr -d '[:blank:]')
nswdacc=$(grep Total "unlvtests/reports/$set.wordacc" |head -2 |tail -1 | nswdacc=$(grep Total "unlvtests/results/$set.wordacc" |head -2 |tail -1 |
cut -c19-26 |tr -d '[:blank:]') cut -c19-26 |tr -d '[:blank:]')
# Compute the percent change. # Compute the percent change.
chdelta=$(deltapc "$cherrs" "$olderrs") chdelta=$(deltapc "$cherrs" "$olderrs")
wdelta=$(deltapc "$wderrs" "$oldwerrs") wdelta=$(deltapc "$wderrs" "$oldwerrs")
nswdelta=$(deltapc "$nswderrs" "$oldnswerrs") nswdelta=$(deltapc "$nswderrs" "$oldnswerrs")
sumfile=$rdir/$vid.$set.sum sumfile=$rdir/$vid.$set.sum
if [ -r "unlvtests/reports/$set.times" ] if [ -r "unlvtests/results/$set.times" ]
then then
total_time=$(timesum "unlvtests/reports/$set.times") total_time=$(timesum "unlvtests/results/$set.times")
if [ -r "unlvtests/reports/prev/$set.times" ] if [ -r "unlvtests/results/prev/$set.times" ]
then then
paste "unlvtests/reports/prev/$set.times" "unlvtests/reports/$set.times" | paste "unlvtests/results/prev/$set.times" "unlvtests/results/$set.times" |
awk '{ printf("%s %.2f\n", $1, $4-$2); }' |sort -k2n >"unlvtests/reports/$set.timedelta" awk '{ printf("%s %.2f\n", $1, $4-$2); }' |sort -k2n >"unlvtests/results/$set.timedelta"
fi fi
else else
total_time='0.0' total_time='0.0'
@ -129,3 +142,6 @@ tfile=$rdir/$vid.total.sum
echo "$vid Total $totalerrs - $chdelta% $totalwerrs\ echo "$vid Total $totalerrs - $chdelta% $totalwerrs\
- $wdelta% $totalnswerrs - $nswdelta%" >"$tfile" - $wdelta% $totalnswerrs - $nswdelta%" >"$tfile"
cat $rdir/1995.*.sum "$rdir/$vid".*.sum >"$rdir/$vid".summary cat $rdir/1995.*.sum "$rdir/$vid".*.sum >"$rdir/$vid".summary
mv "$rdir/$vid".*.sum unlvtests/results/
cat "$rdir/$vid".summary

View File

@ -15,9 +15,9 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
if [ $# -ne 1 ] && [ $# -ne 2 ] if [ $# -ne 3 ] && [ $# -ne 4 ]
then then
echo "Usage:$0 pagesfile [-zoning]" echo "Usage:$0 pagesfile tessdata-dir lang [-zoning]"
exit 1 exit 1
fi fi
if [ ! -d src/api ] if [ ! -d src/api ]
@ -36,13 +36,15 @@ then
fi fi
else else
tess="time -f %U -o times.txt src/api/tesseract" tess="time -f %U -o times.txt src/api/tesseract"
export TESSDATA_PREFIX=$PWD/ #tess="time -f %U -o times.txt tesseract"
fi fi
tessdata=$2
lang=$3
pages=$1 pages=$1
imdir=${pages%/pages} imdir=${pages%/pages}
setname=${imdir##*/} setname=${imdir##*/}
if [ $# -eq 2 ] && [ "$2" = "-zoning" ] if [ $# -eq 4 ] && [ "$4" = "-zoning" ]
then then
config=unlv.auto config=unlv.auto
resdir=unlvtests/results/zoning.$setname resdir=unlvtests/results/zoning.$setname
@ -52,7 +54,7 @@ else
fi fi
echo -e "Testing on set $setname in directory $imdir to $resdir\n" echo -e "Testing on set $setname in directory $imdir to $resdir\n"
mkdir -p "$resdir" mkdir -p "$resdir"
rm -f "unlvtests/reports/$setname.times" rm -f "unlvtests/results/$setname.times"
while read page dir while read page dir
do do
# A pages file may be a list of files with subdirs or maybe just # A pages file may be a list of files with subdirs or maybe just
@ -64,11 +66,11 @@ do
srcdir="$imdir" srcdir="$imdir"
fi fi
# echo "$srcdir/$page.tif" # echo "$srcdir/$page.tif"
$tess "$srcdir/$page.tif" "$resdir/$page" --tessdata-dir ../tessdata_fast --oem 1 -l eng --psm 6 $config 2>&1 |grep -v "OCR Engine" |grep -v "Page 1" $tess "$srcdir/$page.tif" "$resdir/$page" --tessdata-dir $tessdata --oem 1 -l $lang --psm 6 $config 2>&1 |grep -v "OCR Engine" |grep -v "Page 1"
if [ -r times.txt ] if [ -r times.txt ]
then then
read t <times.txt read t <times.txt
echo "$page $t" >>"unlvtests/reports/$setname.times" echo "$page $t" >>"unlvtests/results/$setname.times"
echo -e "\033M$page $t" echo -e "\033M$page $t"
if [ "$t" = "Command terminated by signal 2" ] if [ "$t" = "Command terminated by signal 2" ]
then then