tesseract/langtests/runlangtests.sh
2018-08-30 14:28:34 +00:00

101 lines
3.0 KiB
Bash
Executable File

#!/bin/bash
##############################################################################
# File: runlangtests.sh
# Description: Script to run a set of language test sets for a given
# language code (currently only Fraktur: frk / Fraktur).
# based on runalltests.sh by Ray Smith
# Author: Shree Devi Kumar
# Created: June 09, 2018
#
# (C) Copyright 2007, Google Inc.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
##############################################################################
# Validate the command line: exactly four positional arguments are required.
#   $1 - unlv-data-dir   $2 - version-id   $3 - tessdata-dir   $4 - langcode
if [ "$#" -ne 4 ]; then
  # A usage error is a diagnostic, so it is written to stderr, not stdout.
  echo "Usage:$0 unlv-data-dir version-id tessdata-dir langcode" >&2
  exit 1
fi
# Tessdata directory given on the command line.
tessdata=$3
# Language code given on the command line.
lang=$4
# timesum prints the sum of the second field over every line of a file,
# formatted with two decimals (used for the total cpu time).
# Arguments: $1 - file to read
timesum() {
  local times_file=$1
  awk '{ sum += $2 } END { printf("%.2f\n", sum + 0) }' "$times_file"
}
# First two positional arguments: image/data directory and version id.
vid="$2"
imdir="$1"
# Directory part of the path this script was invoked with; when $0 contains
# no slash there is nothing to strip, so fall back to "./".
case "$0" in
  */*) bindir=${0%/*} ;;
  *) bindir="./" ;;
esac
# Directory where the per-version summary reports are written.
rdir=langtests/reports
# Select the test sets for the requested language; any other language code
# leaves testsets untouched.
case "$lang" in
  frk | Fraktur) testsets="frk-ligatures" ;;
esac
# Writes the two-line summary file (header + values) for one test set.
# Arguments:
#   $1  - output file
#   $2  - version id             $3 - test set name
#   $4  - character errors       $5 - character accuracy
#   $6  - word errors            $7 - word accuracy
#   $8  - non-stopword errors    $9 - non-stopword accuracy
#   $10 - total time, without the trailing "s" unit
write_sumfile() {
  local out=$1
  shift
  {
    # Every column separator is spelled out in the format string. A
    # backslash-newline continuation inside a double-quoted string is simply
    # removed, which glues the neighbouring columns together unless the next
    # line happens to start with whitespace.
    printf '%s %s\n' \
      "RELEASE TestSet CharErrors Accuracy WordErrors Accuracy" \
      "NonStopWErrors Accuracy TimeTaken"
    printf '%s %s %s %s %s %s %s %s %ss\n' \
      "$1" "$2" "$3" "$4" "$5" "$6" "$7" "$8" "$9"
  } >"$out"
}

# Process every selected test set: run OCR, count errors, write a .sum file.
# $testsets is intentionally unquoted: it is a whitespace-separated list.
for testset in $testsets; do
  resdir=langtests/results/$testset
  mkdir -p "$resdir"
  # The Fraktur stopword list is copied under the requested language code.
  cp ~/lang-stopwords/frk.stopwords.txt "$resdir/$lang.stopwords"
  if [ -r "$imdir/$testset/pages" ]; then
    # Run tesseract on all the pages.
    "$bindir/runtestset.sh" "$imdir/$testset/pages" "$tessdata" "$lang"
    # Count the errors on all the pages.
    "$bindir/counttestset.sh" "$imdir/$testset/pages" "$lang"
    # Get the new character word and nonstop word errors and accuracy.
    characc=langtests/results/$testset.characc
    wordacc=langtests/results/$testset.wordacc
    # Errors and accuracy are taken from the first 9 columns of lines 4 and 5
    # of each report, with blanks stripped.
    cherrs=$(head -4 "$characc" | tail -1 | cut -c1-9 | tr -d '[:blank:]')
    chacc=$(head -5 "$characc" | tail -1 | cut -c1-9 | tr -d '[:blank:]')
    wderrs=$(head -4 "$wordacc" | tail -1 | cut -c1-9 | tr -d '[:blank:]')
    wdacc=$(head -5 "$wordacc" | tail -1 | cut -c1-9 | tr -d '[:blank:]')
    # Non-stopword figures come from the second line containing "Total".
    nswderrs=$(grep Total "$wordacc" | head -2 | tail -1 |
      cut -c10-17 | tr -d '[:blank:]')
    nswdacc=$(grep Total "$wordacc" | head -2 | tail -1 |
      cut -c19-26 | tr -d '[:blank:]')
    if [ -r "langtests/results/$testset.times" ]; then
      total_time=$(timesum "langtests/results/$testset.times")
    else
      total_time='0.0'
    fi
    # Make sure the report directory exists before writing into it.
    mkdir -p "$rdir"
    sumfile=$rdir/$vid.$testset.sum
    write_sumfile "$sumfile" "$vid" "$testset" "$cherrs" "$chacc" \
      "$wderrs" "$wdacc" "$nswderrs" "$nswdacc" "$total_time"
  fi
done
# Combine the per-test-set .sum files into one summary, move the individual
# files next to the results, and print the summary.
sumfiles=("$rdir/$vid".*.sum)
# An unmatched glob is left as the literal pattern, so check that the first
# entry really exists before handing the list to cat/mv.
if [ ! -e "${sumfiles[0]}" ]; then
  echo "$0: no $rdir/$vid.*.sum files were produced; nothing to summarize" >&2
  exit 1
fi
cat "${sumfiles[@]}" >"$rdir/$vid".summary
mv "${sumfiles[@]}" langtests/results/
cat "$rdir/$vid".summary