mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2024-12-13 07:59:04 +08:00
Merge pull request #1650 from Shreeshrii/master
Add Spanish UNLV tests, use ocreval tools from /usr/local/bin
This commit is contained in:
commit
3f725dd92e
@ -1,45 +1,70 @@
|
|||||||
How to run UNLV tests.
|
## How to run UNLV tests.
|
||||||
|
|
||||||
The scripts in this directory make it possible to duplicate the tests
|
The scripts in this directory make it possible to duplicate the tests
|
||||||
published in the Fourth Annual Test of OCR Accuracy.
|
published in the Fourth Annual Test of OCR Accuracy.
|
||||||
See http://www.isri.unlv.edu/downloads/AT-1995.pdf
|
See http://www.expervision.com/wp-content/uploads/2012/12/1995.The_Fourth_Annual_Test_of_OCR_Accuracy.pdf
|
||||||
but first you have to get the tools and data used by UNLV:
|
but first you have to get the tools and data used by UNLV:
|
||||||
|
|
||||||
Step 1: to download the images goto
|
### Step 1: to download the images go to
|
||||||
https://sourceforge.net/projects/isri-ocr-evaluation-tools-alt/files/
|
https://sourceforge.net/projects/isri-ocr-evaluation-tools-alt/files/
|
||||||
and get doe3.3B.tar.gz, bus.3B.tar.gz, mag.3B.tar.gz and news.3B.tar.gz
|
and get doe3.3B.tar.gz, bus.3B.tar.gz, mag.3B.tar.gz and news.3B.tar.gz
|
||||||
|
The copy of spn.3B.tar.gz in this repo is incorrect, so get it from the Google Code archive instead.
|
||||||
|
|
||||||
|
```
|
||||||
mkdir -p ~/isri-downloads
|
mkdir -p ~/isri-downloads
|
||||||
cd ~/isri-downloads
|
cd ~/isri-downloads
|
||||||
curl -L https://sourceforge.net/projects/isri-ocr-evaluation-tools-alt/files/bus.3B.tar.gz > bus.3B.tar.gz
|
curl -L https://sourceforge.net/projects/isri-ocr-evaluation-tools-alt/files/bus.3B.tar.gz > bus.3B.tar.gz
|
||||||
curl -L https://sourceforge.net/projects/isri-ocr-evaluation-tools-alt/files/doe3.3B.tar.gz > doe3.3B.tar.gz
|
curl -L https://sourceforge.net/projects/isri-ocr-evaluation-tools-alt/files/doe3.3B.tar.gz > doe3.3B.tar.gz
|
||||||
curl -L https://sourceforge.net/projects/isri-ocr-evaluation-tools-alt/files/mag.3B.tar.gz > mag.3B.tar.gz
|
curl -L https://sourceforge.net/projects/isri-ocr-evaluation-tools-alt/files/mag.3B.tar.gz > mag.3B.tar.gz
|
||||||
curl -L https://sourceforge.net/projects/isri-ocr-evaluation-tools-alt/files/news.3B.tar.gz > news.3B.tar.gz
|
curl -L https://sourceforge.net/projects/isri-ocr-evaluation-tools-alt/files/news.3B.tar.gz > news.3B.tar.gz
|
||||||
|
curl -L https://storage.googleapis.com/google-code-archive-downloads/v2/code.google.com/isri-ocr-evaluation-tools/spn.3B.tar.gz > spn.3B.tar.gz
|
||||||
|
```
|
||||||
|
|
||||||
Step 2: extract the files. It doesn't really matter where
|
### Step 2: extract the files.
|
||||||
|
It doesn't really matter where
|
||||||
in your filesystem you put them, but they must go under a common
|
in your filesystem you put them, but they must go under a common
|
||||||
root so you have directories doe3.3B, bus.3B, mag.3B and news.3B. in, for example,
|
root so you have directories doe3.3B, bus.3B, mag.3B and news.3B in, for example,
|
||||||
~/ISRI-OCRtk.
|
~/ISRI-OCRtk.
|
||||||
|
|
||||||
|
```
|
||||||
mkdir -p ~/ISRI-OCRtk
|
mkdir -p ~/ISRI-OCRtk
|
||||||
cd ~/ISRI-OCRtk
|
cd ~/ISRI-OCRtk
|
||||||
tar xzvf ~/isri-downloads/bus.3B.tar.gz
|
tar xzvf ~/isri-downloads/bus.3B.tar.gz
|
||||||
tar xzvf ~/isri-downloads/doe3.3B.tar.gz
|
tar xzvf ~/isri-downloads/doe3.3B.tar.gz
|
||||||
tar xzvf ~/isri-downloads/mag.3B.tar.gz
|
tar xzvf ~/isri-downloads/mag.3B.tar.gz
|
||||||
tar xzvf ~/isri-downloads/news.3B.tar.gz
|
tar xzvf ~/isri-downloads/news.3B.tar.gz
|
||||||
|
tar xzvf ~/isri-downloads/spn.3B.tar.gz
|
||||||
|
```
|
||||||
|
|
||||||
Step 4: Download the modified ISRI toolkit from:
|
Edit *~/ISRI-OCRtk/spn.3B/pages*
|
||||||
https://ancientgreekocr.org/ocr-evaluation-tools.git
|
and delete the line containing the following image name, as it crashes tesseract:
|
||||||
|
|
||||||
make and install the tools in unlvtests/ocreval/bin by
|
7733_005.3B.tif
|
||||||
`make PREFIX=~/tesseract/unlvtests/ocreval install`
|
|
||||||
|
|
||||||
Step 6: cd back to your main tesseract-ocr dir and Build tesseract.
|
### Step 3: Download the modified ISRI toolkit, then make and install the tools:
|
||||||
|
These will be installed in /usr/local/bin.
|
||||||
|
|
||||||
Step 7: run unlvtests/runalltests.sh with the root ISRI data dir and testname:
|
```
|
||||||
unlvtests/runalltests.sh ~/ISRI-OCRtk tess4.0.0-beta.1
|
git clone https://github.com/Shreeshrii/ocr-evaluation-tools.git
|
||||||
and go to the gym, have lunch etc.
|
cd ocr-evaluation-tools
|
||||||
|
sudo make install
|
||||||
|
```
|
||||||
|
|
||||||
Step 8: There should be a file
|
### Step 4: cd back to your main tesseract-ocr dir and build tesseract.
|
||||||
unlvtests/reports/tess4.0.0-beta.1.summary that contains the final summarized accuracy
|
|
||||||
|
### Step 5: run unlvtests/runalltests.sh with the root ISRI data dir, testname, tessdata-dir and language:
|
||||||
|
|
||||||
|
```
|
||||||
|
unlvtests/runalltests.sh ~/ISRI-OCRtk 4_fast_eng ../tessdata_fast eng
|
||||||
|
```
|
||||||
|
and go to the gym, have lunch etc. It takes a while to run.
|
||||||
|
|
||||||
|
### Step 6: There should be a RELEASE.summary file
|
||||||
|
*unlvtests/reports/4_fast_eng.summary* that contains the final summarized accuracy
|
||||||
report and comparison with the 1995 results.
|
report and comparison with the 1995 results.
|
||||||
|
|
||||||
|
### Step 7: run the test for Spanish.
|
||||||
|
|
||||||
|
```
|
||||||
|
unlvtests/runalltests.sh ~/ISRI-OCRtk 4_fast_spa ../tessdata_fast spa
|
||||||
|
```
|
||||||
|
@ -43,17 +43,17 @@ do
|
|||||||
else
|
else
|
||||||
srcdir="$imdir"
|
srcdir="$imdir"
|
||||||
fi
|
fi
|
||||||
echo "$srcdir/$page.tif"
|
#echo "$srcdir/$page.tif"
|
||||||
# Count character errors.
|
# Count character errors.
|
||||||
unlvtests/ocreval/bin/ocrevalutf8 unlvtests/ocreval/bin/accuracy "$srcdir/$page.txt" "$resdir/$page.unlv" "$resdir/$page.acc"
|
ocrevalutf8 accuracy "$srcdir/$page.txt" "$resdir/$page.unlv" > "$resdir/$page.acc"
|
||||||
accfiles="$accfiles $resdir/$page.acc"
|
accfiles="$accfiles $resdir/$page.acc"
|
||||||
# Count word errors.
|
# Count word errors.
|
||||||
unlvtests/ocreval/bin/ocrevalutf8 unlvtests/ocreval/bin/wordacc "$srcdir/$page.txt" "$resdir/$page.unlv" "$resdir/$page.wa"
|
ocrevalutf8 wordacc "$srcdir/$page.txt" "$resdir/$page.unlv" > "$resdir/$page.wa"
|
||||||
wafiles="$wafiles $resdir/$page.wa"
|
wafiles="$wafiles $resdir/$page.wa"
|
||||||
done <"$pages"
|
done <"$pages"
|
||||||
|
|
||||||
echo "$accfiles"
|
#echo "$accfiles"
|
||||||
echo "$wafiles"
|
#echo "$wafiles"
|
||||||
|
|
||||||
unlvtests/ocreval/bin/accsum $accfiles >"unlvtests/reports/$setname.characc"
|
accsum $accfiles >"unlvtests/results/$setname.characc"
|
||||||
unlvtests/ocreval/bin/ocrevalutf8 unlvtests/ocreval/bin/wordaccsum $wafiles >"unlvtests/reports/$setname.wordacc"
|
wordaccsum $wafiles >"unlvtests/results/$setname.wordacc"
|
||||||
|
1
unlvtests/reports/1995.spn.3B.sum
Normal file
1
unlvtests/reports/1995.spn.3B.sum
Normal file
@ -0,0 +1 @@
|
|||||||
|
1995 spn.3B 100 95.00% 0.00% 100 95.00% 0.00% 100 95.00% 0.00% WAS NOT TESTED
|
@ -15,9 +15,9 @@
|
|||||||
# See the License for the specific language governing permissions and
|
# See the License for the specific language governing permissions and
|
||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
|
|
||||||
if [ $# -ne 2 ]
|
if [ $# -ne 4 ]
|
||||||
then
|
then
|
||||||
echo "Usage:$0 unlv-data-dir version-id"
|
echo "Usage:$0 unlv-data-dir version-id tessdata-dir lang "
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
if [ ! -d src/api ]
|
if [ ! -d src/api ]
|
||||||
@ -30,7 +30,8 @@ then
|
|||||||
echo "Please build tesseract before running $0"
|
echo "Please build tesseract before running $0"
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
|
tessdata=$3
|
||||||
|
lang=$4
|
||||||
|
|
||||||
#deltapc new old calculates the %change from old to new
|
#deltapc new old calculates the %change from old to new
|
||||||
deltapc() {
|
deltapc() {
|
||||||
@ -60,8 +61,20 @@ then
|
|||||||
bindir="./"
|
bindir="./"
|
||||||
fi
|
fi
|
||||||
rdir=unlvtests/reports
|
rdir=unlvtests/reports
|
||||||
#testsets="bus.3B doe3.3B mag.3B news.3B"
|
|
||||||
testsets="bus.3B"
|
if [ "$lang" = "eng" ]
|
||||||
|
then
|
||||||
|
testsets="bus.3B doe3.3B mag.3B news.3B"
|
||||||
|
#testsets="bus.3B"
|
||||||
|
else
|
||||||
|
if [ "$lang" = "spa" ]
|
||||||
|
then
|
||||||
|
testsets="spn.3B"
|
||||||
|
else
|
||||||
|
echo "Language has to be eng or spa"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
|
||||||
totalerrs=0
|
totalerrs=0
|
||||||
totalwerrs=0
|
totalwerrs=0
|
||||||
@ -74,7 +87,7 @@ do
|
|||||||
if [ -r "$imdir/$set/pages" ]
|
if [ -r "$imdir/$set/pages" ]
|
||||||
then
|
then
|
||||||
# Run tesseract on all the pages.
|
# Run tesseract on all the pages.
|
||||||
$bindir/runtestset.sh "$imdir/$set/pages"
|
$bindir/runtestset.sh "$imdir/$set/pages" "$tessdata" "$lang"
|
||||||
# Count the errors on all the pages.
|
# Count the errors on all the pages.
|
||||||
$bindir/counttestset.sh "$imdir/$set/pages"
|
$bindir/counttestset.sh "$imdir/$set/pages"
|
||||||
# Get the old character word and nonstop word errors.
|
# Get the old character word and nonstop word errors.
|
||||||
@ -82,30 +95,30 @@ do
|
|||||||
oldwerrs=$(cut -f6 "unlvtests/reports/1995.$set.sum")
|
oldwerrs=$(cut -f6 "unlvtests/reports/1995.$set.sum")
|
||||||
oldnswerrs=$(cut -f9 "unlvtests/reports/1995.$set.sum")
|
oldnswerrs=$(cut -f9 "unlvtests/reports/1995.$set.sum")
|
||||||
# Get the new character word and nonstop word errors and accuracy.
|
# Get the new character word and nonstop word errors and accuracy.
|
||||||
cherrs=$(head -4 "unlvtests/reports/$set.characc" |tail -1 |cut -c1-9 |
|
cherrs=$(head -4 "unlvtests/results/$set.characc" |tail -1 |cut -c1-9 |
|
||||||
tr -d '[:blank:]')
|
tr -d '[:blank:]')
|
||||||
chacc=$(head -5 "unlvtests/reports/$set.characc" |tail -1 |cut -c1-9 |
|
chacc=$(head -5 "unlvtests/results/$set.characc" |tail -1 |cut -c1-9 |
|
||||||
tr -d '[:blank:]')
|
tr -d '[:blank:]')
|
||||||
wderrs=$(head -4 "unlvtests/reports/$set.wordacc" |tail -1 |cut -c1-9 |
|
wderrs=$(head -4 "unlvtests/results/$set.wordacc" |tail -1 |cut -c1-9 |
|
||||||
tr -d '[:blank:]')
|
tr -d '[:blank:]')
|
||||||
wdacc=$(head -5 "unlvtests/reports/$set.wordacc" |tail -1 |cut -c1-9 |
|
wdacc=$(head -5 "unlvtests/results/$set.wordacc" |tail -1 |cut -c1-9 |
|
||||||
tr -d '[:blank:]')
|
tr -d '[:blank:]')
|
||||||
nswderrs=$(grep Total "unlvtests/reports/$set.wordacc" |head -2 |tail -1 |
|
nswderrs=$(grep Total "unlvtests/results/$set.wordacc" |head -2 |tail -1 |
|
||||||
cut -c10-17 |tr -d '[:blank:]')
|
cut -c10-17 |tr -d '[:blank:]')
|
||||||
nswdacc=$(grep Total "unlvtests/reports/$set.wordacc" |head -2 |tail -1 |
|
nswdacc=$(grep Total "unlvtests/results/$set.wordacc" |head -2 |tail -1 |
|
||||||
cut -c19-26 |tr -d '[:blank:]')
|
cut -c19-26 |tr -d '[:blank:]')
|
||||||
# Compute the percent change.
|
# Compute the percent change.
|
||||||
chdelta=$(deltapc "$cherrs" "$olderrs")
|
chdelta=$(deltapc "$cherrs" "$olderrs")
|
||||||
wdelta=$(deltapc "$wderrs" "$oldwerrs")
|
wdelta=$(deltapc "$wderrs" "$oldwerrs")
|
||||||
nswdelta=$(deltapc "$nswderrs" "$oldnswerrs")
|
nswdelta=$(deltapc "$nswderrs" "$oldnswerrs")
|
||||||
sumfile=$rdir/$vid.$set.sum
|
sumfile=$rdir/$vid.$set.sum
|
||||||
if [ -r "unlvtests/reports/$set.times" ]
|
if [ -r "unlvtests/results/$set.times" ]
|
||||||
then
|
then
|
||||||
total_time=$(timesum "unlvtests/reports/$set.times")
|
total_time=$(timesum "unlvtests/results/$set.times")
|
||||||
if [ -r "unlvtests/reports/prev/$set.times" ]
|
if [ -r "unlvtests/results/prev/$set.times" ]
|
||||||
then
|
then
|
||||||
paste "unlvtests/reports/prev/$set.times" "unlvtests/reports/$set.times" |
|
paste "unlvtests/results/prev/$set.times" "unlvtests/results/$set.times" |
|
||||||
awk '{ printf("%s %.2f\n", $1, $4-$2); }' |sort -k2n >"unlvtests/reports/$set.timedelta"
|
awk '{ printf("%s %.2f\n", $1, $4-$2); }' |sort -k2n >"unlvtests/results/$set.timedelta"
|
||||||
fi
|
fi
|
||||||
else
|
else
|
||||||
total_time='0.0'
|
total_time='0.0'
|
||||||
@ -129,3 +142,6 @@ tfile=$rdir/$vid.total.sum
|
|||||||
echo "$vid Total $totalerrs - $chdelta% $totalwerrs\
|
echo "$vid Total $totalerrs - $chdelta% $totalwerrs\
|
||||||
- $wdelta% $totalnswerrs - $nswdelta%" >"$tfile"
|
- $wdelta% $totalnswerrs - $nswdelta%" >"$tfile"
|
||||||
cat $rdir/1995.*.sum "$rdir/$vid".*.sum >"$rdir/$vid".summary
|
cat $rdir/1995.*.sum "$rdir/$vid".*.sum >"$rdir/$vid".summary
|
||||||
|
|
||||||
|
mv "$rdir/$vid".*.sum unlvtests/results/
|
||||||
|
cat "$rdir/$vid".summary
|
||||||
|
@ -15,9 +15,9 @@
|
|||||||
# See the License for the specific language governing permissions and
|
# See the License for the specific language governing permissions and
|
||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
|
|
||||||
if [ $# -ne 1 ] && [ $# -ne 2 ]
|
if [ $# -ne 3 ] && [ $# -ne 4 ]
|
||||||
then
|
then
|
||||||
echo "Usage:$0 pagesfile [-zoning]"
|
echo "Usage:$0 pagesfile tessdata-dir lang [-zoning]"
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
if [ ! -d src/api ]
|
if [ ! -d src/api ]
|
||||||
@ -36,13 +36,15 @@ then
|
|||||||
fi
|
fi
|
||||||
else
|
else
|
||||||
tess="time -f %U -o times.txt src/api/tesseract"
|
tess="time -f %U -o times.txt src/api/tesseract"
|
||||||
export TESSDATA_PREFIX=$PWD/
|
#tess="time -f %U -o times.txt tesseract"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
tessdata=$2
|
||||||
|
lang=$3
|
||||||
pages=$1
|
pages=$1
|
||||||
imdir=${pages%/pages}
|
imdir=${pages%/pages}
|
||||||
setname=${imdir##*/}
|
setname=${imdir##*/}
|
||||||
if [ $# -eq 2 ] && [ "$2" = "-zoning" ]
|
if [ $# -eq 4 ] && [ "$4" = "-zoning" ]
|
||||||
then
|
then
|
||||||
config=unlv.auto
|
config=unlv.auto
|
||||||
resdir=unlvtests/results/zoning.$setname
|
resdir=unlvtests/results/zoning.$setname
|
||||||
@ -52,7 +54,7 @@ else
|
|||||||
fi
|
fi
|
||||||
echo -e "Testing on set $setname in directory $imdir to $resdir\n"
|
echo -e "Testing on set $setname in directory $imdir to $resdir\n"
|
||||||
mkdir -p "$resdir"
|
mkdir -p "$resdir"
|
||||||
rm -f "unlvtests/reports/$setname.times"
|
rm -f "unlvtests/results/$setname.times"
|
||||||
while read page dir
|
while read page dir
|
||||||
do
|
do
|
||||||
# A pages file may be a list of files with subdirs or maybe just
|
# A pages file may be a list of files with subdirs or maybe just
|
||||||
@ -64,11 +66,11 @@ do
|
|||||||
srcdir="$imdir"
|
srcdir="$imdir"
|
||||||
fi
|
fi
|
||||||
# echo "$srcdir/$page.tif"
|
# echo "$srcdir/$page.tif"
|
||||||
$tess "$srcdir/$page.tif" "$resdir/$page" --tessdata-dir ../tessdata_fast --oem 1 -l eng --psm 6 $config 2>&1 |grep -v "OCR Engine" |grep -v "Page 1"
|
$tess "$srcdir/$page.tif" "$resdir/$page" --tessdata-dir $tessdata --oem 1 -l $lang --psm 6 $config 2>&1 |grep -v "OCR Engine" |grep -v "Page 1"
|
||||||
if [ -r times.txt ]
|
if [ -r times.txt ]
|
||||||
then
|
then
|
||||||
read t <times.txt
|
read t <times.txt
|
||||||
echo "$page $t" >>"unlvtests/reports/$setname.times"
|
echo "$page $t" >>"unlvtests/results/$setname.times"
|
||||||
echo -e "\033M$page $t"
|
echo -e "\033M$page $t"
|
||||||
if [ "$t" = "Command terminated by signal 2" ]
|
if [ "$t" = "Command terminated by signal 2" ]
|
||||||
then
|
then
|
||||||
|
Loading…
Reference in New Issue
Block a user