mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2025-01-18 06:30:14 +08:00
update Spanish UNLV test, use spa.stopwords, iconv to UTF-8
This commit is contained in:
parent
481d7775c6
commit
6559af0c9d
@ -34,11 +34,15 @@ tar xzvf ~/isri-downloads/doe3.3B.tar.gz
|
||||
tar xzvf ~/isri-downloads/mag.3B.tar.gz
|
||||
tar xzvf ~/isri-downloads/news.3B.tar.gz
|
||||
tar xzvf ~/isri-downloads/spn.3B.tar.gz
|
||||
mkdir -p stopwords
|
||||
cd stopwords
|
||||
wget -O spa.stopwords.txt https://raw.githubusercontent.com/stopwords-iso/stopwords-es/master/stopwords-es.txt
|
||||
```
|
||||
Edit ~/ISRI-OCRtk/stopwords/spa.stopwords.txt
|
||||
Replace the newlines with spaces: wordacc uses a space-delimited stopwords file, not a line-delimited one.
|
||||
|
||||
Edit *~/ISRI-OCRtk/spn.3B/pages*
|
||||
Delete the line containing the following image name, as it crashes tesseract.
|
||||
|
||||
7733_005.3B.tif
|
||||
|
||||
### Step 3: Download the modified ISRI toolkit, make and install the tools :
|
||||
@ -52,10 +56,10 @@ sudo make install
|
||||
|
||||
### Step 4: cd back to your main tesseract-ocr dir and Build tesseract.
|
||||
|
||||
### Step 5: run unlvtests/runalltests.sh with the root ISRI data dir, testname, tessdata-dir and language:
|
||||
### Step 5: run unlvtests/runalltests.sh with the root ISRI data dir, testname, tessdata-dir:
|
||||
|
||||
```
|
||||
unlvtests/runalltests.sh ~/ISRI-OCRtk 4_fast_eng ../tessdata_fast eng
|
||||
unlvtests/runalltests.sh ~/ISRI-OCRtk 4_fast_eng ../tessdata_fast
|
||||
```
|
||||
and go to the gym, have lunch etc. It takes a while to run.
|
||||
|
||||
@ -66,5 +70,23 @@ report and comparison with the 1995 results.
|
||||
### Step 7: run the test for Spanish.
|
||||
|
||||
```
|
||||
unlvtests/runalltests.sh ~/ISRI-OCRtk 4_fast_spa ../tessdata_fast spa
|
||||
unlvtests/runalltests_spa.sh ~/ISRI-OCRtk 4_fast_spa ../tessdata_fast
|
||||
```
|
||||
|
||||
#### Notes from Nick White regarding wordacc
|
||||
|
||||
If you just want to remove all lines which have 100% recognition,
|
||||
you can add an 'awk' command like this:
|
||||
|
||||
ocrevalutf8 wordacc ground.txt ocr.txt | awk '$3 != 100 {print $0}'
|
||||
> results.txt
|
||||
|
||||
or if you've already got a results file you want to change, you can do this:
|
||||
|
||||
awk '$3 != 100 {print $0}' results.txt > newresults.txt
|
||||
|
||||
If you only want the last sections where things are broken down by
|
||||
word, you can add a sed command, like this:
|
||||
|
||||
ocrevalutf8 wordacc ground.txt ocr.txt | sed '/^ Count Missed %Right $/,$
|
||||
!d' | awk '$3 != 100 {print $0}' > results.txt
|
||||
|
@ -15,9 +15,9 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
if [ $# -ne 1 ]
|
||||
if [ $# -ne 2 ]
|
||||
then
|
||||
echo "Usage:$0 pagesfile"
|
||||
echo "Usage:$0 pagesfile langcode"
|
||||
exit 1
|
||||
fi
|
||||
if [ ! -d src/api ]
|
||||
@ -27,6 +27,7 @@ then
|
||||
fi
|
||||
|
||||
pages=$1
|
||||
langcode=$2
|
||||
|
||||
imdir=${pages%/pages}
|
||||
setname=${imdir##*/}
|
||||
@ -45,15 +46,22 @@ do
|
||||
fi
|
||||
#echo "$srcdir/$page.tif"
|
||||
# Count character errors.
|
||||
ocrevalutf8 accuracy "$srcdir/$page.txt" "$resdir/$page.unlv" > "$resdir/$page.acc"
|
||||
iconv -f ISO8859-1 -t UTF-8 "$resdir/$page.unlv" >"$resdir/$page.text"
|
||||
iconv -f ISO8859-1 -t UTF-8 "$srcdir/$page.txt" >"$srcdir/$page.text"
|
||||
ocrevalutf8 accuracy "$srcdir/$page.text" "$resdir/$page.text" > "$resdir/$page.acc"
|
||||
accfiles="$accfiles $resdir/$page.acc"
|
||||
# Count word errors.
|
||||
ocrevalutf8 wordacc "$srcdir/$page.txt" "$resdir/$page.unlv" > "$resdir/$page.wa"
|
||||
# langcode should be either eng or spa; any value other than eng is scored
# with the Spanish stopword list.
if [ "$langcode" = "eng" ]
then
  ocrevalutf8 wordacc "$srcdir/$page.text" "$resdir/$page.text" > "$resdir/$page.wa"
else
  # Take the stopword list from the invoking user's ISRI-OCRtk directory (the
  # location the README setup step creates) instead of a hard-coded
  # /home/ubuntu path that only exists on one machine.
  cp "$HOME/ISRI-OCRtk/stopwords/spa.stopwords.txt" "$resdir/spa.stopwords"
  ocrevalutf8 wordacc -S"$resdir/spa.stopwords" "$srcdir/$page.text" "$resdir/$page.text" > "$resdir/$page.wa"
fi
|
||||
wafiles="$wafiles $resdir/$page.wa"
|
||||
done <"$pages"
|
||||
|
||||
#echo "$accfiles"
|
||||
#echo "$wafiles"
|
||||
|
||||
accsum $accfiles >"unlvtests/results/$setname.characc"
|
||||
wordaccsum $wafiles >"unlvtests/results/$setname.wordacc"
|
||||
|
||||
|
@ -1 +0,0 @@
|
||||
1995 spn.3B 100 95.00% 0.00% 100 95.00% 0.00% 100 95.00% 0.00% WAS NOT TESTED
|
@ -1,6 +1,6 @@
|
||||
#!/bin/bash
|
||||
# File: runalltests.sh
|
||||
# Description: Script to run a set of UNLV test sets.
|
||||
# Description: Script to run a set of UNLV test sets for English.
|
||||
# Author: Ray Smith
|
||||
# Created: Thu Jun 14 08:21:01 PDT 2007
|
||||
#
|
||||
@ -15,9 +15,9 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
if [ $# -ne 4 ]
|
||||
if [ $# -ne 3 ]
|
||||
then
|
||||
echo "Usage:$0 unlv-data-dir version-id tessdata-dir lang "
|
||||
echo "Usage:$0 unlv-data-dir version-id tessdata-dir"
|
||||
exit 1
|
||||
fi
|
||||
if [ ! -d src/api ]
|
||||
@ -31,7 +31,6 @@ then
|
||||
exit 1
|
||||
fi
|
||||
tessdata=$3
|
||||
lang=$4
|
||||
|
||||
#deltapc new old calculates the %change from old to new
|
||||
deltapc() {
|
||||
@ -62,19 +61,8 @@ then
|
||||
fi
|
||||
rdir=unlvtests/reports
|
||||
|
||||
if [ "$lang" = "eng" ]
|
||||
then
|
||||
testsets="bus.3B doe3.3B mag.3B news.3B"
|
||||
#testsets="bus.3B"
|
||||
else
|
||||
if [ "$lang" = "spa" ]
|
||||
then
|
||||
testsets="spn.3B"
|
||||
else
|
||||
echo "Language has to be eng or spa"
|
||||
exit 1
|
||||
fi
|
||||
fi
|
||||
testsets="bus.3B doe3.3B mag.3B news.3B"
|
||||
#testsets="bus.3B"
|
||||
|
||||
totalerrs=0
|
||||
totalwerrs=0
|
||||
@ -87,7 +75,7 @@ do
|
||||
if [ -r "$imdir/$set/pages" ]
|
||||
then
|
||||
# Run tesseract on all the pages.
|
||||
$bindir/runtestset.sh "$imdir/$set/pages" "$tessdata" "$lang"
|
||||
$bindir/runtestset.sh "$imdir/$set/pages" "$tessdata" "eng"
|
||||
# Count the errors on all the pages.
|
||||
$bindir/counttestset.sh "$imdir/$set/pages"
|
||||
# Get the old character word and nonstop word errors.
|
||||
|
109
unlvtests/runalltests_spa.sh
Executable file
109
unlvtests/runalltests_spa.sh
Executable file
@ -0,0 +1,109 @@
|
||||
#!/bin/bash
|
||||
##############################################################################
|
||||
# File: runalltests_spa.sh
|
||||
# Description: Script to run a set of UNLV test sets for Spanish.
|
||||
# based on runalltests.sh by Ray Smith
|
||||
# Author: Shree Devi Kumar
|
||||
# Created: June 09, 2018
|
||||
#
|
||||
# (C) Copyright 2007, Google Inc.
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
##############################################################################
|
||||
if [ $# -ne 3 ]
|
||||
then
|
||||
echo "Usage:$0 unlv-data-dir version-id tessdata-dir"
|
||||
exit 1
|
||||
fi
|
||||
if [ ! -d src/api ]
|
||||
then
|
||||
echo "Run $0 from the tesseract-ocr root directory!"
|
||||
exit 1
|
||||
fi
|
||||
if [ ! -r src/api/tesseract ] && [ ! -r tesseract.exe ]
|
||||
then
|
||||
echo "Please build tesseract before running $0"
|
||||
exit 1
|
||||
fi
|
||||
# tessdata-dir argument, passed through to runtestset.sh.
# No fourth argument is read: the argument-count check above only accepts
# exactly three, and the language is fixed to "spa" where the helper scripts
# are invoked.
tessdata=$3
|
||||
|
||||
#######################################
# timesum computes the total cpu time by summing the second
# whitespace-separated field of every line in a file.
# Arguments: $1 - path of the file to total (the caller passes a
#                 unlvtests/results/<set>.times file)
# Outputs:   the total on stdout with two decimals ("0.00" for an empty file)
#######################################
timesum() {
  awk '
    BEGIN { total = 0.0 }
    { total += $2 }
    END { printf("%.2f\n", total) }
  ' "$1"
}
|
||||
|
||||
imdir="$1"
|
||||
vid="$2"
|
||||
bindir=${0%/*}
|
||||
if [ "$bindir" = "$0" ]
|
||||
then
|
||||
bindir="./"
|
||||
fi
|
||||
rdir=unlvtests/reports
|
||||
|
||||
testsets="spn.3B"
|
||||
|
||||
totalerrs=0
|
||||
totalwerrs=0
|
||||
totalnswerrs=0
|
||||
for set in $testsets
|
||||
do
|
||||
if [ -r "$imdir/$set/pages" ]
|
||||
then
|
||||
# Run tesseract on all the pages.
# bindir is quoted so the helper scripts are still found when this script is
# launched from a path that contains spaces.
"$bindir/runtestset.sh" "$imdir/$set/pages" "$tessdata" "spa"
# Count the errors on all the pages.
"$bindir/counttestset.sh" "$imdir/$set/pages" "spa"
|
||||
# Get the new character word and nonstop word errors and accuracy.
|
||||
cherrs=$(head -4 "unlvtests/results/$set.characc" |tail -1 |cut -c1-9 |
|
||||
tr -d '[:blank:]')
|
||||
chacc=$(head -5 "unlvtests/results/$set.characc" |tail -1 |cut -c1-9 |
|
||||
tr -d '[:blank:]')
|
||||
wderrs=$(head -4 "unlvtests/results/$set.wordacc" |tail -1 |cut -c1-9 |
|
||||
tr -d '[:blank:]')
|
||||
wdacc=$(head -5 "unlvtests/results/$set.wordacc" |tail -1 |cut -c1-9 |
|
||||
tr -d '[:blank:]')
|
||||
nswderrs=$(grep Total "unlvtests/results/$set.wordacc" |head -2 |tail -1 |
|
||||
cut -c10-17 |tr -d '[:blank:]')
|
||||
nswdacc=$(grep Total "unlvtests/results/$set.wordacc" |head -2 |tail -1 |
|
||||
cut -c19-26 |tr -d '[:blank:]')
|
||||
|
||||
sumfile=$rdir/$vid.$set.sum
|
||||
if [ -r "unlvtests/results/$set.times" ]
|
||||
then
|
||||
total_time=$(timesum "unlvtests/results/$set.times")
|
||||
if [ -r "unlvtests/results/prev/$set.times" ]
|
||||
then
|
||||
paste "unlvtests/results/prev/$set.times" "unlvtests/results/$set.times" |
|
||||
awk '{ printf("%s %.2f\n", $1, $4-$2); }' |sort -k2n >"unlvtests/results/$set.timedelta"
|
||||
fi
|
||||
else
|
||||
total_time='0.0'
|
||||
fi
|
||||
echo "RELEASE TestSet CharErrors Accuracy WordErrors Accuracy\
|
||||
NonStopWordErrors Accuracy TimeTaken">"$sumfile"
|
||||
echo "$vid $set $cherrs $chacc $wderrs $wdacc\
|
||||
$nswderrs $nswdacc ${total_time}s" >>"$sumfile"
|
||||
fi
|
||||
done
|
||||
|
||||
cat "$rdir/$vid".*.sum >"$rdir/$vid".summary
|
||||
|
||||
mv "$rdir/$vid".*.sum unlvtests/results/
|
||||
cat "$rdir/$vid".summary
|
Loading…
Reference in New Issue
Block a user