backport doc and testing changes from master (4.0)

This commit is contained in:
Zdenko Podobný 2017-05-05 20:40:35 +02:00
parent 5fe49627bf
commit 10f96d1e80
5 changed files with 64 additions and 58 deletions

View File

@@ -84,14 +84,20 @@ before any 'configfile'.
SINGLE OPTIONS SINGLE OPTIONS
-------------- --------------
'-v':: '-h, --help'::
Show help message.
'--help-psm'::
Show page segmentation modes.
'-v, --version'::
Returns the current version of the tesseract(1) executable. Returns the current version of the tesseract(1) executable.
'--list-langs':: '--list-langs'::
list available languages for tesseract engine. Can be used with --tessdata-dir. List available languages for tesseract engine. Can be used with --tessdata-dir.
'--print-parameters':: '--print-parameters'::
print tesseract parameters to the stdout. Print tesseract parameters.

View File

@@ -49,13 +49,13 @@ do
fi fi
# echo "$srcdir/$page.tif" # echo "$srcdir/$page.tif"
# Count character errors. # Count character errors.
testing/unlv/accuracy $srcdir/$page.txt $resdir/$page.txt $resdir/$page.acc testing/unlv/accuracy "$srcdir/$page.txt" "$resdir/$page.txt" "$resdir/$page.acc"
accfiles="$accfiles $resdir/$page.acc" accfiles="$accfiles $resdir/$page.acc"
# Count word errors. # Count word errors.
testing/unlv/wordacc $srcdir/$page.txt $resdir/$page.txt $resdir/$page.wa testing/unlv/wordacc "$srcdir/$page.txt" "$resdir/$page.txt" "$resdir/$page.wa"
wafiles="$wafiles $resdir/$page.wa" wafiles="$wafiles $resdir/$page.wa"
done <$pages done <"$pages"
testing/unlv/accsum $accfiles >testing/reports/$setname.characc testing/unlv/accsum "$accfiles" >"testing/reports/$setname.characc"
testing/unlv/wordaccsum $wafiles >testing/reports/$setname.wordacc testing/unlv/wordaccsum "$wafiles" >"testing/reports/$setname.wordacc"

View File

@@ -33,21 +33,21 @@ do
if [ -r "$old/PAGES" ] if [ -r "$old/PAGES" ]
then then
new=${s#*/}.$ext new=${s#*/}.$ext
mkdir -p $new mkdir -p "$new"
echo "Set $old -> $new" echo "Set $old -> $new"
#The pages file had - instead of _ so fix it and add the extension. #The pages file had - instead of _ so fix it and add the extension.
for page in `cat $old/PAGES` for page in $(cat $old/PAGES)
do do
echo "${page%-*}_${page#*-}.$ext" echo "${page%-*}_${page#*-}.$ext"
done >$new/pages done >"$new/pages"
for f in `cat $new/pages` for f in $(cat "$new/pages")
do do
#Put a tif extension on the tif files. #Put a tif extension on the tif files.
cp $old/${old}_B/$f $new/$f.tif cp "$old/${old}_B/$f" "$new/$f.tif"
#Put a uzn extension on the zone files. #Put a uzn extension on the zone files.
cp $old/${old}_B/${f}Z $new/$f.uzn cp "$old/${old}_B/${f}Z" "$new/$f.uzn"
#Cat all the truth files together and put into a single txt file. #Cat all the truth files together and put into a single txt file.
cat $old/${old}_GT/${f%.$ext}.Z* >$new/$f.txt cat "$old/${old}_GT/${f%.$ext}".Z* >"$new/$f.txt"
done done
fi fi
done done

View File

@@ -25,12 +25,12 @@ then
echo "Run $0 from the tesseract-ocr root directory!" echo "Run $0 from the tesseract-ocr root directory!"
exit 1 exit 1
fi fi
if [ ! -r api/tesseract -a ! -r tesseract.exe ] if [ ! -r api/tesseract ] && [ ! -r tesseract.exe ]
then then
echo "Please build tesseract before running $0" echo "Please build tesseract before running $0"
exit 1 exit 1
fi fi
if [ ! -r testing/unlv/accuracy -a ! -r testing/unlv/accuracy.exe ] if [ ! -r testing/unlv/accuracy ] && [ ! -r testing/unlv/accuracy.exe ]
then then
echo "Please download the UNLV accuracy tools (and build) to testing/unlv" echo "Please download the UNLV accuracy tools (and build) to testing/unlv"
exit 1 exit 1
@@ -39,7 +39,7 @@ fi
#deltapc new old calculates the %change from old to new #deltapc new old calculates the %change from old to new
deltapc() { deltapc() {
awk ' BEGIN { awk ' BEGIN {
printf("%.2f", 100.0*('$1'-'$2')/'$2'); printf("%.2f", 100.0*('"$1"'-'"$2"')/'"$2"');
}' }'
} }
@@ -53,7 +53,7 @@ total = 0.0;
} }
END { END {
printf("%.2f\n", total); printf("%.2f\n", total);
}' $1 }' "$1"
} }
imdir="$1" imdir="$1"
@@ -74,47 +74,47 @@ totaloldwerrs=0
totaloldnswerrs=0 totaloldnswerrs=0
for set in $testsets for set in $testsets
do do
if [ -r $imdir/$set/pages ] if [ -r "$imdir/$set/pages" ]
then then
# Run tesseract on all the pages. # Run tesseract on all the pages.
$bindir/runtestset.sh $imdir/$set/pages $bindir/runtestset.sh "$imdir/$set/pages"
# Count the errors on all the pages. # Count the errors on all the pages.
$bindir/counttestset.sh $imdir/$set/pages $bindir/counttestset.sh "$imdir/$set/pages"
# Get the old character word and nonstop word errors. # Get the old character word and nonstop word errors.
olderrs=`cat testing/reports/1995.$set.sum | cut -f3` olderrs=$(cut -f3 "testing/reports/1995.$set.sum")
oldwerrs=`cat testing/reports/1995.$set.sum | cut -f6` oldwerrs=$(cut -f6 "testing/reports/1995.$set.sum")
oldnswerrs=`cat testing/reports/1995.$set.sum | cut -f9` oldnswerrs=$(cut -f9 "testing/reports/1995.$set.sum")
# Get the new character word and nonstop word errors and accuracy. # Get the new character word and nonstop word errors and accuracy.
cherrs=`head -4 testing/reports/$set.characc |tail -1 |cut -c1-9 | cherrs=$(head -4 "testing/reports/$set.characc" |tail -1 |cut -c1-9 |
tr -d '[:blank:]'` tr -d '[:blank:]')
chacc=`head -5 testing/reports/$set.characc |tail -1 |cut -c1-9 | chacc=$(head -5 "testing/reports/$set.characc" |tail -1 |cut -c1-9 |
tr -d '[:blank:]'` tr -d '[:blank:]')
wderrs=`head -4 testing/reports/$set.wordacc |tail -1 |cut -c1-9 | wderrs=$(head -4 "testing/reports/$set.wordacc" |tail -1 |cut -c1-9 |
tr -d '[:blank:]'` tr -d '[:blank:]')
wdacc=`head -5 testing/reports/$set.wordacc |tail -1 |cut -c1-9 | wdacc=$(head -5 "testing/reports/$set.wordacc" |tail -1 |cut -c1-9 |
tr -d '[:blank:]'` tr -d '[:blank:]')
nswderrs=`grep Total testing/reports/$set.wordacc |head -2 |tail -1 | nswderrs=$(grep Total "testing/reports/$set.wordacc" |head -2 |tail -1 |
cut -c10-17 |tr -d '[:blank:]'` cut -c10-17 |tr -d '[:blank:]')
nswdacc=`grep Total testing/reports/$set.wordacc |head -2 |tail -1 | nswdacc=$(grep Total "testing/reports/$set.wordacc" |head -2 |tail -1 |
cut -c19-26 |tr -d '[:blank:]'` cut -c19-26 |tr -d '[:blank:]')
# Compute the percent change. # Compute the percent change.
chdelta=`deltapc $cherrs $olderrs` chdelta=$(deltapc "$cherrs" "$olderrs")
wdelta=`deltapc $wderrs $oldwerrs` wdelta=$(deltapc "$wderrs" "$oldwerrs")
nswdelta=`deltapc $nswderrs $oldnswerrs` nswdelta=$(deltapc "$nswderrs" "$oldnswerrs")
sumfile=$rdir/$vid.$set.sum sumfile=$rdir/$vid.$set.sum
if [ -r testing/reports/$set.times ] if [ -r "testing/reports/$set.times" ]
then then
total_time=`timesum testing/reports/$set.times` total_time=$(timesum "testing/reports/$set.times")
if [ -r testing/reports/prev/$set.times ] if [ -r "testing/reports/prev/$set.times" ]
then then
paste testing/reports/prev/$set.times testing/reports/$set.times | paste "testing/reports/prev/$set.times" "testing/reports/$set.times" |
awk '{ printf("%s %.2f\n", $1, $4-$2); }' |sort -k2n >testing/reports/$set.timedelta awk '{ printf("%s %.2f\n", $1, $4-$2); }' |sort -k2n >"testing/reports/$set.timedelta"
fi fi
else else
total_time='0.0' total_time='0.0'
fi fi
echo "$vid $set $cherrs $chacc $chdelta% $wderrs $wdacc\ echo "$vid $set $cherrs $chacc $chdelta% $wderrs $wdacc\
$wdelta% $nswderrs $nswdacc $nswdelta% ${total_time}s" >$sumfile $wdelta% $nswderrs $nswdacc $nswdelta% ${total_time}s" >"$sumfile"
# Sum totals over all the testsets. # Sum totals over all the testsets.
let totalerrs=totalerrs+cherrs let totalerrs=totalerrs+cherrs
let totalwerrs=totalwerrs+wderrs let totalwerrs=totalwerrs+wderrs
@@ -125,10 +125,10 @@ do
fi fi
done done
# Compute grand total percent change. # Compute grand total percent change.
chdelta=`deltapc $totalerrs $totalolderrs` chdelta=$(deltapc $totalerrs $totalolderrs)
wdelta=`deltapc $totalwerrs $totaloldwerrs` wdelta=$(deltapc $totalwerrs $totaloldwerrs)
nswdelta=`deltapc $totalnswerrs $totaloldnswerrs ` nswdelta=$(deltapc $totalnswerrs $totaloldnswerrs)
tfile=$rdir/$vid.total.sum tfile=$rdir/$vid.total.sum
echo "$vid Total $totalerrs - $chdelta% $totalwerrs\ echo "$vid Total $totalerrs - $chdelta% $totalwerrs\
- $wdelta% $totalnswerrs - $nswdelta%" >$tfile - $wdelta% $totalnswerrs - $nswdelta%" >"$tfile"
cat $rdir/1995.*.sum $rdir/$vid.*.sum >$rdir/$vid.summary cat $rdir/1995.*.sum "$rdir/$vid".*.sum >"$rdir/$vid".summary

View File

@@ -15,7 +15,7 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
if [ $# -ne 1 -a $# -ne 2 ] if [ $# -ne 1 ] && [ $# -ne 2 ]
then then
echo "Usage:$0 pagesfile [-zoning]" echo "Usage:$0 pagesfile [-zoning]"
exit 1 exit 1
@@ -42,7 +42,7 @@ fi
pages=$1 pages=$1
imdir=${pages%/pages} imdir=${pages%/pages}
setname=${imdir##*/} setname=${imdir##*/}
if [ $# -eq 2 -a "$2" = "-zoning" ] if [ $# -eq 2 ] && [ "$2" = "-zoning" ]
then then
config=unlv.auto config=unlv.auto
resdir=testing/results/zoning.$setname resdir=testing/results/zoning.$setname
@@ -51,8 +51,8 @@ else
resdir=testing/results/$setname resdir=testing/results/$setname
fi fi
echo -e "Testing on set $setname in directory $imdir to $resdir\n" echo -e "Testing on set $setname in directory $imdir to $resdir\n"
mkdir -p $resdir mkdir -p "$resdir"
rm -f testing/reports/$setname.times rm -f "testing/reports/$setname.times"
while read page dir while read page dir
do do
# A pages file may be a list of files with subdirs or maybe just # A pages file may be a list of files with subdirs or maybe just
@@ -64,15 +64,15 @@ do
srcdir="$imdir" srcdir="$imdir"
fi fi
# echo "$srcdir/$page.tif" # echo "$srcdir/$page.tif"
$tess $srcdir/$page.tif $resdir/$page --psm 6 $config 2>&1 |grep -v "OCR Engine" $tess "$srcdir/$page.tif" "$resdir/$page" --psm 6 $config 2>&1 |grep -v "OCR Engine"
if [ -r times.txt ] if [ -r times.txt ]
then then
read t <times.txt read t <times.txt
echo "$page $t" >>testing/reports/$setname.times echo "$page $t" >>"testing/reports/$setname.times"
echo -e "\033M$page $t" echo -e "\033M$page $t"
if [ "$t" = "Command terminated by signal 2" ] if [ "$t" = "Command terminated by signal 2" ]
then then
exit 0 exit 0
fi fi
fi fi
done <$pages done <"$pages"