Merge pull request #1657 from Shreeshrii/master

update Spanish UNLV test, use spa.stopwords, iconv to UTF-8
This commit is contained in:
Egor Pugin 2018-06-09 18:31:35 +03:00 committed by GitHub
commit 37dadbe478
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
9 changed files with 169 additions and 32 deletions

View File

@ -10,3 +10,6 @@ EXTRA_DIST += reports/1995.mag.3B.sum
EXTRA_DIST += reports/1995.news.3B.sum
EXTRA_DIST += reports/2.03.summary
EXTRA_DIST += reports/2.04.summary
EXTRA_DIST += reports/4_best_spa.summary
EXTRA_DIST += reports/4_best_int_spa.summary
EXTRA_DIST += reports/4_fast_spa.summary

View File

@ -34,12 +34,18 @@ tar xzvf ~/isri-downloads/doe3.3B.tar.gz
tar xzvf ~/isri-downloads/mag.3B.tar.gz
tar xzvf ~/isri-downloads/news.3B.tar.gz
tar xzvf ~/isri-downloads/spn.3B.tar.gz
mkdir -p stopwords
cd stopwords
wget -O spa.stopwords.txt https://raw.githubusercontent.com/stopwords-iso/stopwords-es/master/stopwords-es.txt
```
Edit ~/ISRI-OCRtk/stopwords/spa.stopwords.txt
wordacc uses a space delimited stopwords file, not line delimited.
s/\n/ /g
Edit *~/ISRI-OCRtk/spn.3B/pages*
delete the line containing the following imagename as it crashes tesseract.
Edit ~/ISRI-OCRtk/spn.3B/pages
Delete the line containing the following imagename as it [crashes tesseract](https://github.com/tesseract-ocr/tesseract/issues/1647#issuecomment-395954717).
7733_005.3B.tif
7733_005.3B 3
### Step 3: Download the modified ISRI toolkit, make and install the tools :
These will be installed in /usr/local/bin.
@ -52,10 +58,10 @@ sudo make install
### Step 4: cd back to your main tesseract-ocr dir and Build tesseract.
### Step 5: run unlvtests/runalltests.sh with the root ISRI data dir, testname, tessdata-dir and language:
### Step 5: run unlvtests/runalltests.sh with the root ISRI data dir, testname, tessdata-dir:
```
unlvtests/runalltests.sh ~/ISRI-OCRtk 4_fast_eng ../tessdata_fast eng
unlvtests/runalltests.sh ~/ISRI-OCRtk 4_fast_eng ../tessdata_fast
```
and go to the gym, have lunch etc. It takes a while to run.
@ -66,5 +72,23 @@ report and comparison with the 1995 results.
### Step 7: run the test for Spanish.
```
unlvtests/runalltests.sh ~/ISRI-OCRtk 4_fast_spa ../tessdata_fast spa
unlvtests/runalltests_spa.sh ~/ISRI-OCRtk 4_fast_spa ../tessdata_fast
```
#### Notes from Nick White regarding wordacc
If you just want to remove all lines which have 100% recognition,
you can add a 'awk' command like this:
ocrevalutf8 wordacc ground.txt ocr.txt | awk '$3 != 100 {print $0}' > results.txt
or if you've already got a results file you want to change, you can do this:
awk '$3 != 100 {print $0}' results.txt > newresults.txt
If you only want the last sections where things are broken down by
word, you can add a sed command, like this:
ocrevalutf8 wordacc ground.txt ocr.txt | sed '/^ Count Missed %Right $/,$!d' | awk '$3 != 100 {print $0}' > results.txt

View File

@ -15,9 +15,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.
if [ $# -ne 1 ]
if [ $# -ne 2 ]
then
echo "Usage:$0 pagesfile"
echo "Usage:$0 pagesfile langcode"
exit 1
fi
if [ ! -d src/api ]
@ -27,6 +27,7 @@ then
fi
pages=$1
langcode=$2
imdir=${pages%/pages}
setname=${imdir##*/}
@ -45,15 +46,22 @@ do
fi
#echo "$srcdir/$page.tif"
# Count character errors.
ocrevalutf8 accuracy "$srcdir/$page.txt" "$resdir/$page.unlv" > "$resdir/$page.acc"
iconv -f ISO8859-1 -t UTF-8 "$resdir/$page.unlv" >"$resdir/$page.text"
iconv -f ISO8859-1 -t UTF-8 "$srcdir/$page.txt" >"$srcdir/$page.text"
ocrevalutf8 accuracy "$srcdir/$page.text" "$resdir/$page.text" > "$resdir/$page.acc"
accfiles="$accfiles $resdir/$page.acc"
# Count word errors.
ocrevalutf8 wordacc "$srcdir/$page.txt" "$resdir/$page.unlv" > "$resdir/$page.wa"
#langcode should be either eng or spa
if [ "$langcode" = "eng" ]
then
ocrevalutf8 wordacc "$srcdir/$page.text" "$resdir/$page.text" > "$resdir/$page.wa"
else
cp ~/ISRI-OCRtk/stopwords/spa.stopwords.txt "$resdir/spa.stopwords"
ocrevalutf8 wordacc -S"$resdir/spa.stopwords" "$srcdir/$page.text" "$resdir/$page.text" > "$resdir/$page.wa"
fi
wafiles="$wafiles $resdir/$page.wa"
done <"$pages"
#echo "$accfiles"
#echo "$wafiles"
accsum $accfiles >"unlvtests/results/$setname.characc"
wordaccsum $wafiles >"unlvtests/results/$setname.wordacc"

View File

@ -1 +0,0 @@
1995 spn.3B 100 95.00% 0.00% 100 95.00% 0.00% 100 95.00% 0.00% WAS NOT TESTED

View File

@ -0,0 +1,2 @@
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWordErrors Accuracy TimeTaken
4_best_int_spa spn.3B 2846 99.18% 937 98.39% 739 97.54 6478.02s

View File

@ -0,0 +1,2 @@
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWordErrors Accuracy TimeTaken
4_best_spa spn.3B 2823 99.19% 924 98.41% 729 97.57 7233.76s

View File

@ -0,0 +1,2 @@
RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWordErrors Accuracy TimeTaken
4_fast_spa spn.3B 2841 99.18% 879 98.49% 742 97.53 3838.82s

View File

@ -1,6 +1,6 @@
#!/bin/bash
# File: runalltests.sh
# Description: Script to run a set of UNLV test sets.
# Description: Script to run a set of UNLV test sets for English.
# Author: Ray Smith
# Created: Thu Jun 14 08:21:01 PDT 2007
#
@ -15,9 +15,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.
if [ $# -ne 4 ]
if [ $# -ne 3 ]
then
echo "Usage:$0 unlv-data-dir version-id tessdata-dir lang "
echo "Usage:$0 unlv-data-dir version-id tessdata-dir"
exit 1
fi
if [ ! -d src/api ]
@ -31,7 +31,6 @@ then
exit 1
fi
tessdata=$3
lang=$4
#deltapc new old calculates the %change from old to new
deltapc() {
@ -62,19 +61,8 @@ then
fi
rdir=unlvtests/reports
if [ "$lang" = "eng" ]
then
testsets="bus.3B doe3.3B mag.3B news.3B"
#testsets="bus.3B"
else
if [ "$lang" = "spa" ]
then
testsets="spn.3B"
else
echo "Language has to be eng or spa"
exit 1
fi
fi
testsets="bus.3B doe3.3B mag.3B news.3B"
#testsets="bus.3B"
totalerrs=0
totalwerrs=0
@ -87,7 +75,7 @@ do
if [ -r "$imdir/$set/pages" ]
then
# Run tesseract on all the pages.
$bindir/runtestset.sh "$imdir/$set/pages" "$tessdata" "$lang"
$bindir/runtestset.sh "$imdir/$set/pages" "$tessdata" "eng"
# Count the errors on all the pages.
$bindir/counttestset.sh "$imdir/$set/pages"
# Get the old character word and nonstop word errors.

109
unlvtests/runalltests_spa.sh Executable file
View File

@ -0,0 +1,109 @@
#!/bin/bash
##############################################################################
# File: runalltests_spa.sh
# Description: Script to run a set of UNLV test sets for Spanish.
# based on runalltests.sh by Ray Smith
# Author: Shree Devi Kumar
# Created: June 09, 2018
#
# (C) Copyright 2007, Google Inc.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
##############################################################################
# Usage: runalltests_spa.sh unlv-data-dir version-id tessdata-dir
# Exactly three arguments are required; the language is fixed to "spa".
if [ $# -ne 3 ]
then
  echo "Usage:$0 unlv-data-dir version-id tessdata-dir"
  exit 1
fi
# Must be run from the tesseract-ocr root so that relative paths
# (src/api, unlvtests/...) resolve correctly.
if [ ! -d src/api ]
then
  echo "Run $0 from the tesseract-ocr root directory!"
  exit 1
fi
if [ ! -r src/api/tesseract ] && [ ! -r tesseract.exe ]
then
  echo "Please build tesseract before running $0"
  exit 1
fi
tessdata=$3
# NOTE: the original assigned lang=$4 here, but the script accepts only
# 3 arguments (checked above), so $4 was always empty and unused; the
# language is hardcoded to "spa" in the calls below. Removed as dead code.

#timesum computes the total cpu time from a file of "page seconds" pairs
timesum() {
  awk ' BEGIN {
    total = 0.0;
  }
  {
    total += $2;
  }
  END {
    printf("%.2f\n", total);
  }' "$1"
}

imdir="$1"
vid="$2"
# Directory containing this script; fall back to "./" when invoked
# without a path component (${0%/*} leaves $0 unchanged in that case).
bindir=${0%/*}
if [ "$bindir" = "$0" ]
then
  bindir="./"
fi
rdir=unlvtests/reports

# Only the Spanish UNLV test set.
testsets="spn.3B"

totalerrs=0
totalwerrs=0
totalnswerrs=0
for set in $testsets
do
  if [ -r "$imdir/$set/pages" ]
  then
    # Run tesseract on all the pages.
    "$bindir"/runtestset.sh "$imdir/$set/pages" "$tessdata" "spa"
    # Count the errors on all the pages (Spanish uses a stopwords file).
    "$bindir"/counttestset.sh "$imdir/$set/pages" "spa"
    # Extract the new character, word and nonstop-word errors and accuracy
    # from the fixed-column accsum/wordaccsum report layout.
    cherrs=$(head -4 "unlvtests/results/$set.characc" |tail -1 |cut -c1-9 |
      tr -d '[:blank:]')
    chacc=$(head -5 "unlvtests/results/$set.characc" |tail -1 |cut -c1-9 |
      tr -d '[:blank:]')
    wderrs=$(head -4 "unlvtests/results/$set.wordacc" |tail -1 |cut -c1-9 |
      tr -d '[:blank:]')
    wdacc=$(head -5 "unlvtests/results/$set.wordacc" |tail -1 |cut -c1-9 |
      tr -d '[:blank:]')
    nswderrs=$(grep Total "unlvtests/results/$set.wordacc" |head -2 |tail -1 |
      cut -c10-17 |tr -d '[:blank:]')
    nswdacc=$(grep Total "unlvtests/results/$set.wordacc" |head -2 |tail -1 |
      cut -c19-26 |tr -d '[:blank:]')
    sumfile=$rdir/$vid.$set.sum
    # Total the CPU time, and produce a per-page time delta against the
    # previous run when one is available.
    if [ -r "unlvtests/results/$set.times" ]
    then
      total_time=$(timesum "unlvtests/results/$set.times")
      if [ -r "unlvtests/results/prev/$set.times" ]
      then
        paste "unlvtests/results/prev/$set.times" "unlvtests/results/$set.times" |
          awk '{ printf("%s %.2f\n", $1, $4-$2); }' |sort -k2n >"unlvtests/results/$set.timedelta"
      fi
    else
      total_time='0.0'
    fi
    # Write the one-line summary for this test set.
    echo "RELEASE TestSet CharErrors Accuracy WordErrors Accuracy\
 NonStopWordErrors Accuracy TimeTaken">"$sumfile"
    echo "$vid $set $cherrs $chacc $wderrs $wdacc\
 $nswderrs $nswdacc ${total_time}s" >>"$sumfile"
  fi
done
# Concatenate the per-set summaries into the overall report and show it.
cat "$rdir/$vid".*.sum >"$rdir/$vid".summary
mv "$rdir/$vid".*.sum unlvtests/results/
cat "$rdir/$vid".summary