mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2024-12-03 17:29:04 +08:00
106 lines
3.2 KiB
Bash
Executable File
106 lines
3.2 KiB
Bash
Executable File
#!/bin/bash
##############################################################################
# File: runlangtests.sh
# Description: Script to run a set of accuracy test sets for any language.
# based on runalltests.sh by Ray Smith
# Author: Shree Devi Kumar
# Created: June 09, 2018
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
##############################################################################
# Exactly five arguments are required; otherwise print the usage line on
# stderr (it is a diagnostic, not program output) and fail.
if [ $# -ne 5 ]
then
    echo "Usage:$0 unlv-data-dir version-id tessdata-dir langcode imgext" >&2
    exit 1
fi

# Arguments 3 to 5. Arguments 1 and 2 are read further down in the script.
tessdata=$3
lang=$4
imgext=$5

# timesum computes the total cpu time.
# Arguments: $1 - file to read; the second whitespace-separated field of
#                 every line is added to the total.
# Outputs:   the total, formatted with two decimals, on stdout.
timesum() {
    awk '
        BEGIN { total = 0.0 }
        { total += $2 }
        END { printf("%.2f\n", total) }
    ' "$1"
}

imdir="$1"
vid="$2"

# Directory part of the path this script was invoked with; the helper
# scripts runtestset.sh and counttestset.sh are run from there. When $0
# contains no slash, fall back to the current directory.
bindir=${0%/*}
if [ "$bindir" = "$0" ]
then
    bindir="./"
fi
rdir=langtests/reports

# Choose the test sets for the requested language or script name.
case "$lang" in
    frk|Fraktur)
        testsets="frk-ligatures"
        ;;
    san|Devanagari)
        testsets="san-fontsamples san-oldstyle san-shreelipi san-alphabetsamples"
        ### testsets="san-fontsamples"
        ;;
esac

# Without any test set there is nothing to run, and the summary step below
# would only fail on an unmatched glob, so stop with a clear message.
# testsets is deliberately not reset above, so a value already present in
# the environment is still honoured for other languages.
if [ -z "${testsets:-}" ]
then
    echo "$0: no test sets defined for language '$lang'" >&2
    exit 1
fi

# The per-set summary files are written here; make sure it exists.
mkdir -p "$rdir"

# $testsets is intentionally unquoted: it is a space-separated list.
for testset in $testsets
do
    resdir=langtests/results/$testset
    mkdir -p "$resdir"

    # Install the stop-word list as $resdir/$lang.stopwords (presumably read
    # by counttestset.sh -- confirm). Prefer the list named after $lang and
    # fall back to the Fraktur list, which used to be copied for every
    # language.
    stopwords=~/lang-stopwords/$lang.stopwords.txt
    if [ ! -r "$stopwords" ]
    then
        stopwords=~/lang-stopwords/frk.stopwords.txt
    fi
    cp "$stopwords" "$resdir/$lang.stopwords"

    if [ -r "$imdir/$testset/pages" ]
    then
        # Run tesseract on all the pages.
        "$bindir/runtestset.sh" "$imdir/$testset/pages" "$tessdata" "$lang" "$imgext"
        # Count the errors on all the pages.
        "$bindir/counttestset.sh" "$imdir/$testset/pages" "$lang"

        # Get the new character word and nonstop word errors and accuracy.
        # Each value is cut from fixed columns of the report files:
        # line 4 holds the error count and line 5 the accuracy; the
        # non-stopword figures come from the second "Total" line of the
        # word report.
        characc="langtests/results/$testset.characc"
        wordacc="langtests/results/$testset.wordacc"
        cherrs=$(head -n 4 "$characc" | tail -n 1 | cut -c1-9 |
            tr -d '[:blank:]')
        chacc=$(head -n 5 "$characc" | tail -n 1 | cut -c1-9 |
            tr -d '[:blank:]')
        wderrs=$(head -n 4 "$wordacc" | tail -n 1 | cut -c1-9 |
            tr -d '[:blank:]')
        wdacc=$(head -n 5 "$wordacc" | tail -n 1 | cut -c1-9 |
            tr -d '[:blank:]')
        nswderrs=$(grep Total "$wordacc" | head -n 2 | tail -n 1 |
            cut -c10-17 | tr -d '[:blank:]')
        nswdacc=$(grep Total "$wordacc" | head -n 2 | tail -n 1 |
            cut -c19-26 | tr -d '[:blank:]')

        sumfile=$rdir/$vid.$testset.sum
        if [ -r "langtests/results/$testset.times" ]
        then
            total_time=$(timesum "langtests/results/$testset.times")
        else
            total_time='0.0'
        fi

        # Header row followed by one data row. Both are written as single
        # strings so that every column keeps its separator.
        printf '%s\n' \
            "RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken" \
            >"$sumfile"
        printf '%s\n' \
            "$vid $testset $cherrs $chacc $wderrs $wdacc $nswderrs $nswdacc ${total_time}s" \
            >>"$sumfile"
    fi
done

# Concatenate the per-set summaries into one report, move the per-set files
# next to the other results, and show the combined report.
cat "$rdir/$vid".*.sum >"$rdir/$vid".summary

mv "$rdir/$vid".*.sum langtests/results/
cat "$rdir/$vid".summary