diff --git a/Makefile.am b/Makefile.am index 5a4f3163..55d17ab3 100644 --- a/Makefile.am +++ b/Makefile.am @@ -24,7 +24,7 @@ SUBDIRS += src/ccmain src/api . tessdata doc unittest EXTRA_DIST = README.md LICENSE EXTRA_DIST += aclocal.m4 config configure.ac autogen.sh -EXTRA_DIST += tesseract.pc.in $(TRAINING_SUBDIR) java doc langtests unlvtests +EXTRA_DIST += tesseract.pc.in $(TRAINING_SUBDIR) java doc EXTRA_DIST += CMakeLists.txt tesseract.pc.cmake cmake VERSION src/vs2010 cppan.yml DIST_SUBDIRS = $(SUBDIRS) $(TRAINING_SUBDIR) diff --git a/configure.ac b/configure.ac index 28258066..c129d327 100644 --- a/configure.ac +++ b/configure.ac @@ -463,7 +463,6 @@ fi # Output files AC_CONFIG_FILES([Makefile tesseract.pc]) -AC_CONFIG_FILES([langtests/Makefile]) AC_CONFIG_FILES([src/api/Makefile]) AC_CONFIG_FILES([src/api/tess_version.h]) AC_CONFIG_FILES([src/arch/Makefile]) @@ -481,7 +480,6 @@ AC_CONFIG_FILES([src/wordrec/Makefile]) AC_CONFIG_FILES([tessdata/Makefile]) AC_CONFIG_FILES([tessdata/configs/Makefile]) AC_CONFIG_FILES([tessdata/tessconfigs/Makefile]) -AC_CONFIG_FILES([unlvtests/Makefile]) AC_CONFIG_FILES([unittest/Makefile]) AC_CONFIG_FILES([java/Makefile]) AC_CONFIG_FILES([java/com/Makefile]) diff --git a/langtests/.gitignore b/langtests/.gitignore deleted file mode 100644 index d9f9b7fa..00000000 --- a/langtests/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -# -results/* diff --git a/langtests/Makefile.am b/langtests/Makefile.am deleted file mode 100644 index 2103eeef..00000000 --- a/langtests/Makefile.am +++ /dev/null @@ -1,8 +0,0 @@ - -EXTRA_DIST = README.md -EXTRA_DIST += frk_setup.sh -EXTRA_DIST += frk_test.sh -EXTRA_DIST += counttestset.sh -EXTRA_DIST += runlangtests.sh -EXTRA_DIST += runtestset.sh -EXTRA_DIST += reports/* diff --git a/langtests/README.md b/langtests/README.md deleted file mode 100644 index 2730fd89..00000000 --- a/langtests/README.md +++ /dev/null @@ -1,54 +0,0 @@ -# Language tests. -The scripts in this directory make it possible to test Accuracy of Tesseract for different languages. -## Setup -### Step 1: If not already installed, download the modified ISRI toolkit, -make and install the tools in /usr/local/bin. -``` -git clone https://github.com/Shreeshrii/ocr-evaluation-tools.git -cd ~/ocr-evaluation-tools -sudo make install -``` -### Step 2: If not alrady built, Build tesseract. -Use binaries from the tesseract/src/api and tesseract/src/training directory. -### Step 3 -Download images and corresponding ground truth text for the language to be tested. -Each testset should have only one kind of images (eg. tif, png, jpg etc). -The ground truth text files should have the same base filename with txt extension. -As needed, modify the filenames and create the `pages` file for each testset. -Instructions for testing Fraktur and Sanskrit languages are given below as an example. -## Testing for Fraktur - frk and script/Fraktur -### Download the images and groundtruth, modify to required format. -``` -bash -x frk_setup.sh -``` -### Run tests for Fraktur - frk and script/Fraktur -``` -bash -x frk_test.sh -``` -## Testing for Sanskrit - san and script/Devanagari -### Download the images and groundtruth, modify to required format. -``` -bash -x deva_setup.sh -``` -### Run tests -``` -bash -x deva_test.sh -``` - -### Notes from Nick White regarding wordacc - -If you just want to remove all lines which have 100% recognition, -you can add a 'awk' command like this: - -ocrevalutf8 wordacc ground.txt ocr.txt | awk '$3 != 100 {print $0}' -results.txt - -or if you've already got a results file you want to change, you can do this: - -awk '$3 != 100 {print $0}' results.txt newresults.txt - -If you only want the last sections where things are broken down by -word, you can add a sed commend, like this: - -ocrevalutf8 wordacc ground.txt ocr.txt | sed '/^ Count Missed %Right $/,$ -!d' | awk '$3 != 100 {print $0}' results.txt diff --git a/langtests/counttestset.sh b/langtests/counttestset.sh deleted file mode 100755 index 9c3c825d..00000000 --- a/langtests/counttestset.sh +++ /dev/null @@ -1,52 +0,0 @@ -#!/bin/bash -# File: counttestset.sh -# Description: Script to count the errors on a single UNLV set. -# Author: Ray Smith -# Created: Wed Jun 13 11:58:01 PDT 2007 -# -# (C) Copyright 2007, Google Inc. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -if [ $# -ne 2 ] -then - echo "Usage:$0 pagesfile langcode" - exit 1 -fi - -pages=$1 -langcode=$2 - -imdir=${pages%/pages} -setname=${imdir##*/} -resdir=langtests/results/$setname -mkdir -p langtests/reports -echo "Counting on set $setname in directory $imdir to $resdir" -accfiles="" -wafiles="" -while read page dir -do - if [ "$dir" ] - then - srcdir="$imdir/$dir" - else - srcdir="$imdir" - fi - echo "$srcdir/$page" - # Count character errors. - ocrevalutf8 accuracy "$srcdir/$page.txt" "$resdir/$page.txt" > "$resdir/$page.acc" - accfiles="$accfiles $resdir/$page.acc" - # Count word errors. - ocrevalutf8 wordacc -S"$resdir/$langcode.stopwords" "$srcdir/$page.txt" "$resdir/$page.txt" > "$resdir/$page.wa" - wafiles="$wafiles $resdir/$page.wa" -done <"$pages" - -accsum $accfiles >"langtests/results/$setname.characc" -wordaccsum $wafiles >"langtests/results/$setname.wordacc" diff --git a/langtests/deva_setup.sh b/langtests/deva_setup.sh deleted file mode 100644 index 15d5d8be..00000000 --- a/langtests/deva_setup.sh +++ /dev/null @@ -1,18 +0,0 @@ -#!/bin/bash -# -mkdir -p ~/lang-files -rm -rf ~/lang-files/san-* -for testset in vedic fontsamples oldstyle shreelipi alphabetsamples -do - cd ~/lang-files - mkdir -p ./san-$testset - cp ~/lang-deva-downloads/imagessan/$testset/*.* ./san-$testset/ - cd ./san-$testset/ - rename s/-gt.txt/.txt/ *.txt - ls -1 *.png >pages - sed -i -e 's/.png//g' pages -done - -mkdir -p ~/lang-stopwords -cd ~/lang-stopwords -cp ~/lang-deva-downloads/imagessan/stopwords.txt ./san.stopwords.txt diff --git a/langtests/deva_test.sh b/langtests/deva_test.sh deleted file mode 100644 index 9add94b4..00000000 --- a/langtests/deva_test.sh +++ /dev/null @@ -1,18 +0,0 @@ -#!/bin/bash -# run langtests/runlangtests.sh with the root data dir, testname, tessdata-dir, language code and image extension - -cd ~/tesseract - -langtests/runlangtests.sh ~/lang-files 4_fast_Devanagari ../tessdata_fast/script Devanagari png -langtests/runlangtests.sh ~/lang-files 4_best_int_Devanagari ../tessdata/script Devanagari png -langtests/runlangtests.sh ~/lang-files 4_best_Devanagari ../tessdata_best/script Devanagari png -langtests/runlangtests.sh ~/lang-files 4_fast_san ../tessdata_fast san png -langtests/runlangtests.sh ~/lang-files 4_best_int_san ../tessdata san png -langtests/runlangtests.sh ~/lang-files 4_best_san ../tessdata_best san png - -langtests/runlangtests.sh ~/lang-files 4_plus40k_san ../tesstutorial-deva san png - -#/home/ubuntu/tesstutorial-deva/san.traineddata at n iterations - -### It takes a while to run. - diff --git a/langtests/frk_setup.sh b/langtests/frk_setup.sh deleted file mode 100644 index e86b6109..00000000 --- a/langtests/frk_setup.sh +++ /dev/null @@ -1,24 +0,0 @@ -#!/bin/bash -# -mkdir -p ~/lang-downloads -cd ~/lang-downloads -wget -O frk-jbarth-ubhd.zip http://digi.ub.uni-heidelberg.de/diglitData/v/abbyy11r8-vs-tesseract4.zip -wget -O frk-stweil-gt.zip https://digi.bib.uni-mannheim.de/~stweil/fraktur-gt.zip - -mkdir -p ~/lang-files -cd ~/lang-files -unzip ~/lang-downloads/frk-jbarth-ubhd.zip -d frk -unzip ~/lang-downloads/frk-stweil-gt.zip -d frk -mkdir -p ./frk-ligatures -cp ./frk/abbyy-vs-tesseract/*.tif ./frk-ligatures/ -cp ./frk/gt/*.txt ./frk-ligatures/ - -cd ./frk-ligatures/ -ls -1 *.tif >pages -sed -i -e 's/.tif//g' pages - -mkdir -p ~/lang-stopwords -cd ~/lang-stopwords -wget -O frk.stopwords.txt https://raw.githubusercontent.com/stopwords-iso/stopwords-de/master/stopwords-de.txt - -echo "Edit ~/lang-files/stopwords/frk.stopwords.txt as wordacc uses a space delimited stopwords file, not line delimited." diff --git a/langtests/frk_test.sh b/langtests/frk_test.sh deleted file mode 100644 index 0ab32821..00000000 --- a/langtests/frk_test.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash -# -# run langtests/runlangtests.sh with the root ISRI data dir, testname, tessdata-dir, language code: - -cd ~/tesseract -langtests/runlangtests.sh ~/lang-files 4_fast_Fraktur ../tessdata_fast/script Fraktur tif - -langtests/runlangtests.sh ~/lang-files 4_fast_frk ../tessdata_fast frk tif -langtests/runlangtests.sh ~/lang-files 4_best_int_frk ../tessdata frk tif -langtests/runlangtests.sh ~/lang-files 4_best_frk ../tessdata_best frk tif - -### It takes a while to run. - diff --git a/langtests/reports/4_best_Devanagari.summary b/langtests/reports/4_best_Devanagari.summary deleted file mode 100644 index ad988e6e..00000000 --- a/langtests/reports/4_best_Devanagari.summary +++ /dev/null @@ -1,8 +0,0 @@ -RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken -4_best_Devanagari san-alphabetsamples 2013 56.17% 1323 12.27% 1323 12.27 606.28s -RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken -4_best_Devanagari san-fontsamples 388 94.82% 87 86.38% 87 86.38 570.17s -RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken -4_best_Devanagari san-oldstyle 2796 59.93% 523 39.61% 523 39.61 447.73s -RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken -4_best_Devanagari san-shreelipi 830 94.01% 311 81.40% 311 81.40 1137.51s diff --git a/langtests/reports/4_best_frk.summary b/langtests/reports/4_best_frk.summary deleted file mode 100644 index 0b963f68..00000000 --- a/langtests/reports/4_best_frk.summary +++ /dev/null @@ -1,2 +0,0 @@ -RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken -4_best_frk frk-ligatures 178 94.73% 100 81.31% 74 75.17 94.29s diff --git a/langtests/reports/4_best_int_Devanagari.summary b/langtests/reports/4_best_int_Devanagari.summary deleted file mode 100644 index fe31dc7c..00000000 --- a/langtests/reports/4_best_int_Devanagari.summary +++ /dev/null @@ -1,8 +0,0 @@ -RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken -4_best_int_Devanagari san-alphabetsamples 2010 56.24% 1321 12.40% 1321 12.40 556.26s -RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken -4_best_int_Devanagari san-fontsamples 396 94.72% 89 86.07% 89 86.07 524.07s -RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken -4_best_int_Devanagari san-oldstyle 2812 59.70% 523 39.61% 523 39.61 416.57s -RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken -4_best_int_Devanagari san-shreelipi 829 94.01% 314 81.22% 314 81.22 1087.02s diff --git a/langtests/reports/4_best_int_frk.summary b/langtests/reports/4_best_int_frk.summary deleted file mode 100644 index 5d2bc776..00000000 --- a/langtests/reports/4_best_int_frk.summary +++ /dev/null @@ -1,2 +0,0 @@ -RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken -4_best_int_frk frk-ligatures 244 92.78% 109 79.63% 80 73.15 367.73s diff --git a/langtests/reports/4_best_int_san.summary b/langtests/reports/4_best_int_san.summary deleted file mode 100644 index 3e140f1c..00000000 --- a/langtests/reports/4_best_int_san.summary +++ /dev/null @@ -1,8 +0,0 @@ -RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken -4_best_int_san san-alphabetsamples 2342 49.01% 1353 10.28% 1353 10.28 281.60s -RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken -4_best_int_san san-fontsamples 474 93.68% 126 80.28% 126 80.28 281.05s -RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken -4_best_int_san san-oldstyle 3121 55.27% 602 30.48% 602 30.48 206.20s -RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken -4_best_int_san san-shreelipi 1163 91.60% 417 75.06% 417 75.06 606.80s diff --git a/langtests/reports/4_best_san.summary b/langtests/reports/4_best_san.summary deleted file mode 100644 index 948f62a4..00000000 --- a/langtests/reports/4_best_san.summary +++ /dev/null @@ -1,8 +0,0 @@ -RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken -4_best_san san-alphabetsamples 2335 49.16% 1348 10.61% 1348 10.61 300.24s -RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken -4_best_san san-fontsamples 473 93.69% 126 80.28% 126 80.28 267.05s -RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken -4_best_san san-oldstyle 3121 55.27% 598 30.95% 598 30.95 205.28s -RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken -4_best_san san-shreelipi 1168 91.56% 414 75.24% 414 75.24 610.52s diff --git a/langtests/reports/4_fast_Devanagari.summary b/langtests/reports/4_fast_Devanagari.summary deleted file mode 100644 index 356e6882..00000000 --- a/langtests/reports/4_fast_Devanagari.summary +++ /dev/null @@ -1,8 +0,0 @@ -RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken -4_fast_Devanagari san-alphabetsamples 2017 56.09% 1317 12.67% 1317 12.67 400.38s -RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken -4_fast_Devanagari san-fontsamples 433 94.22% 108 83.10% 108 83.10 287.48s -RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken -4_fast_Devanagari san-oldstyle 2883 58.68% 543 37.30% 543 37.30 289.85s -RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken -4_fast_Devanagari san-shreelipi 750 94.58% 279 83.31% 279 83.31 813.19s diff --git a/langtests/reports/4_fast_Fraktur.summary b/langtests/reports/4_fast_Fraktur.summary deleted file mode 100644 index b8f8e81b..00000000 --- a/langtests/reports/4_fast_Fraktur.summary +++ /dev/null @@ -1,2 +0,0 @@ -RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken -4_fast_Fraktur frk-ligatures 265 92.16% 116 78.32% 82 72.48 91.29s diff --git a/langtests/reports/4_fast_frk.summary b/langtests/reports/4_fast_frk.summary deleted file mode 100644 index 42ce1bcd..00000000 --- a/langtests/reports/4_fast_frk.summary +++ /dev/null @@ -1,2 +0,0 @@ -RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken -4_fast_frk frk-ligatures 244 92.78% 109 79.63% 80 73.15 89.98s diff --git a/langtests/reports/4_fast_san.summary b/langtests/reports/4_fast_san.summary deleted file mode 100644 index e37ff3ca..00000000 --- a/langtests/reports/4_fast_san.summary +++ /dev/null @@ -1,8 +0,0 @@ -RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken -4_fast_san san-alphabetsamples 2342 49.01% 1353 10.28% 1353 10.28 276.73s -RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken -4_fast_san san-fontsamples 474 93.68% 126 80.28% 126 80.28 278.34s -RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken -4_fast_san san-oldstyle 3121 55.27% 602 30.48% 602 30.48 222.35s -RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken -4_fast_san san-shreelipi 1163 91.60% 417 75.06% 417 75.06 626.40s diff --git a/langtests/reports/4_plus10k_san.summary b/langtests/reports/4_plus10k_san.summary deleted file mode 100644 index e8251c99..00000000 --- a/langtests/reports/4_plus10k_san.summary +++ /dev/null @@ -1,8 +0,0 @@ -RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken -4_plus10k_san san-alphabetsamples 1725 62.44% 1112 26.26% 1112 26.26 160.48s -RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken -4_plus10k_san san-fontsamples 349 95.34% 73 88.58% 73 88.58 138.09s -RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken -4_plus10k_san san-oldstyle 2818 59.62% 548 36.72% 548 36.72 120.83s -RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken -4_plus10k_san san-shreelipi 746 94.61% 279 83.31% 279 83.31 292.70s diff --git a/langtests/reports/4_plus20k_san.summary b/langtests/reports/4_plus20k_san.summary deleted file mode 100644 index 640f8149..00000000 --- a/langtests/reports/4_plus20k_san.summary +++ /dev/null @@ -1,8 +0,0 @@ -RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken -4_plus20k_san san-alphabetsamples 1441 68.63% 841 44.23% 841 44.23 156.57s -RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken -4_plus20k_san san-fontsamples 356 95.25% 75 88.26% 75 88.26 135.13s -RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken -4_plus20k_san san-oldstyle 2862 58.99% 555 35.91% 555 35.91 118.21s -RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken -4_plus20k_san san-shreelipi 726 94.76% 267 84.03% 267 84.03 295.68s diff --git a/langtests/reports/4_plus30k_san.summary b/langtests/reports/4_plus30k_san.summary deleted file mode 100644 index febc5757..00000000 --- a/langtests/reports/4_plus30k_san.summary +++ /dev/null @@ -1,8 +0,0 @@ -RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken -4_plus30k_san san-alphabetsamples 1656 63.95% 937 37.86% 937 37.86 615.62s -RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken -4_plus30k_san san-fontsamples 429 94.28% 89 86.07% 89 86.07 617.42s -RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken -4_plus30k_san san-oldstyle 2885 58.66% 561 35.22% 561 35.22 432.58s -RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken -4_plus30k_san san-shreelipi 447 96.77% 123 92.64% 123 92.64 1081.29s diff --git a/langtests/reports/4_plus40k_san.summary b/langtests/reports/4_plus40k_san.summary deleted file mode 100644 index 1ead5c1f..00000000 --- a/langtests/reports/4_plus40k_san.summary +++ /dev/null @@ -1,8 +0,0 @@ -RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken -4_plus40k_san san-alphabetsamples 1380 69.95% 775 48.61% 775 48.61 1198.16s -RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken -4_plus40k_san san-fontsamples 401 94.65% 79 87.64% 79 87.64 1275.08s -RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken -4_plus40k_san san-oldstyle 2860 59.01% 534 38.34% 534 38.34 977.65s -RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken -4_plus40k_san san-shreelipi 441 96.81% 113 93.24% 113 93.24 2301.53s diff --git a/langtests/runlangtests.sh b/langtests/runlangtests.sh deleted file mode 100755 index 300c68ee..00000000 --- a/langtests/runlangtests.sh +++ /dev/null @@ -1,105 +0,0 @@ -#!/bin/bash -############################################################################## -# File: runlangtests.sh -# Description: Script to run a set of accuracy test sets for any language. -# based on runalltests.sh by Ray Smith -# Author: Shree Devi Kumar -# Created: June 09, 2018 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -############################################################################## -if [ $# -ne 5 ] -then - echo "Usage:$0 unlv-data-dir version-id tessdata-dir langcode imgext" - exit 1 -fi - -tessdata=$3 -lang=$4 -imgext=$5 - -#timesum computes the total cpu time -timesum() { -awk ' BEGIN { -total = 0.0; -} -{ - total += $2; -} -END { - printf("%.2f\n", total); -}' "$1" -} - -imdir="$1" -vid="$2" -bindir=${0%/*} -if [ "$bindir" = "$0" ] -then - bindir="./" -fi -rdir=langtests/reports -if [ "$lang" = "frk" ] || [ "$lang" = "Fraktur" ] - then - testsets="frk-ligatures" -fi -if [ "$lang" = "san" ] || [ "$lang" = "Devanagari" ] - then - testsets="san-fontsamples san-oldstyle san-shreelipi san-alphabetsamples" - ### testsets="san-fontsamples" -fi - -totalerrs=0 -totalwerrs=0 -totalnswerrs=0 -for set in $testsets -do - resdir=langtests/results/$set - mkdir -p "$resdir" - cp ~/lang-stopwords/frk.stopwords.txt "$resdir/$lang.stopwords" - if [ -r "$imdir/$set/pages" ] - then - # Run tesseract on all the pages. - $bindir/runtestset.sh "$imdir/$set/pages" "$tessdata" "$lang" "$imgext" - # Count the errors on all the pages. - $bindir/counttestset.sh "$imdir/$set/pages" $lang - # Get the new character word and nonstop word errors and accuracy. - cherrs=$(head -4 "langtests/results/$set.characc" |tail -1 |cut -c1-9 | - tr -d '[:blank:]') - chacc=$(head -5 "langtests/results/$set.characc" |tail -1 |cut -c1-9 | - tr -d '[:blank:]') - wderrs=$(head -4 "langtests/results/$set.wordacc" |tail -1 |cut -c1-9 | - tr -d '[:blank:]') - wdacc=$(head -5 "langtests/results/$set.wordacc" |tail -1 |cut -c1-9 | - tr -d '[:blank:]') - nswderrs=$(grep Total "langtests/results/$set.wordacc" |head -2 |tail -1 | - cut -c10-17 |tr -d '[:blank:]') - nswdacc=$(grep Total "langtests/results/$set.wordacc" |head -2 |tail -1 | - cut -c19-26 |tr -d '[:blank:]') - - sumfile=$rdir/$vid.$set.sum - if [ -r "langtests/results/$set.times" ] - then - total_time=$(timesum "langtests/results/$set.times") - else - total_time='0.0' - fi - echo "RELEASE TestSet CharErrors Accuracy WordErrors Accuracy\ - NonStopWErrors Accuracy TimeTaken">"$sumfile" - echo "$vid $set $cherrs $chacc $wderrs $wdacc\ - $nswderrs $nswdacc ${total_time}s" >>"$sumfile" - fi -done - -cat "$rdir/$vid".*.sum >"$rdir/$vid".summary - -mv "$rdir/$vid".*.sum langtests/results/ -cat "$rdir/$vid".summary diff --git a/langtests/runtestset.sh b/langtests/runtestset.sh deleted file mode 100755 index 3771e79d..00000000 --- a/langtests/runtestset.sh +++ /dev/null @@ -1,61 +0,0 @@ -#!/bin/bash -# File: runtestset.sh -# Description: Script to run tesseract on a single UNLV set. -# Author: Ray Smith -# Created: Wed Jun 13 10:13:01 PDT 2007 -# -# (C) Copyright 2007, Google Inc. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -if [ $# -ne 4 ] -then - echo "Usage:$0 pagesfile tessdata-dir langcode imgext" - exit 1 -fi - -tess="time -f %U -o times.txt ./src/api/tesseract" - -tessdata=$2 -langcode=$3 -imgext=$4 -pages=$1 -imdir=${pages%/pages} -setname=${imdir##*/} - -config="" -resdir=langtests/results/$setname - -echo -e "Testing on set $setname in directory $imdir to $resdir\n" -mkdir -p "$resdir" -rm -f "langtests/results/$setname.times" -while read page dir -do - # A pages file may be a list of files with subdirs or maybe just - # a plain list of files so accommodate both. - if [ "$dir" ] - then - srcdir="$imdir/$dir" - else - srcdir="$imdir" - fi - echo "$srcdir/$page" - $tess "$srcdir/$page.$imgext" "$resdir/$page" --tessdata-dir $tessdata --oem 1 -l $langcode --psm 6 $config 2>&1 |grep -v "OCR Engine" |grep -v "Page 1" - if [ -r times.txt ] - then - read t >"langtests/results/$setname.times" - echo -e "\033M$page $t" - if [ "$t" = "Command terminated by signal 2" ] - then - exit 0 - fi - fi -done <"$pages" diff --git a/unlvtests/Makefile.am b/unlvtests/Makefile.am deleted file mode 100644 index 68471bf2..00000000 --- a/unlvtests/Makefile.am +++ /dev/null @@ -1,12 +0,0 @@ - -EXTRA_DIST = README.md -EXTRA_DIST += counttestset.sh -EXTRA_DIST += runalltests.sh -EXTRA_DIST += runalltests_spa.sh -EXTRA_DIST += runtestset.sh -EXTRA_DIST += reports/1995.bus.3B.sum -EXTRA_DIST += reports/1995.doe3.3B.sum -EXTRA_DIST += reports/1995.mag.3B.sum -EXTRA_DIST += reports/1995.news.3B.sum -EXTRA_DIST += reports/2.03.summary -EXTRA_DIST += reports/2.04.summary diff --git a/unlvtests/README.md b/unlvtests/README.md deleted file mode 100644 index 32687f1a..00000000 --- a/unlvtests/README.md +++ /dev/null @@ -1,94 +0,0 @@ -## How to run UNLV tests. - -The scripts in this directory make it possible to duplicate the tests -published in the Fourth Annual Test of OCR Accuracy. -See http://www.expervision.com/wp-content/uploads/2012/12/1995.The_Fourth_Annual_Test_of_OCR_Accuracy.pdf -but first you have to get the tools and data used by UNLV: - -### Step 1: to download the images go to -https://sourceforge.net/projects/isri-ocr-evaluation-tools-alt/files/ -and get doe3.3B.tar.gz, bus.3B.tar.gz, mag.3B.tar.gz and news.3B.tar.gz -spn.3B.tar.gz is incorrect in this repo, so get it from code.google - -``` -mkdir -p ~/isri-downloads -cd ~/isri-downloads -curl -L https://sourceforge.net/projects/isri-ocr-evaluation-tools-alt/files/bus.3B.tar.gz > bus.3B.tar.gz -curl -L https://sourceforge.net/projects/isri-ocr-evaluation-tools-alt/files/doe3.3B.tar.gz > doe3.3B.tar.gz -curl -L https://sourceforge.net/projects/isri-ocr-evaluation-tools-alt/files/mag.3B.tar.gz > mag.3B.tar.gz -curl -L https://sourceforge.net/projects/isri-ocr-evaluation-tools-alt/files/news.3B.tar.gz > news.3B.tar.gz -curl -L https://storage.googleapis.com/google-code-archive-downloads/v2/code.google.com/isri-ocr-evaluation-tools/spn.3B.tar.gz > spn.3B.tar.gz -``` - -### Step 2: extract the files. -It doesn't really matter where -in your filesystem you put them, but they must go under a common -root so you have directories doe3.3B, bus.3B, mag.3B and news.3B. in, for example, -~/ISRI-OCRtk. - -``` -mkdir -p ~/ISRI-OCRtk -cd ~/ISRI-OCRtk -tar xzvf ~/isri-downloads/bus.3B.tar.gz -tar xzvf ~/isri-downloads/doe3.3B.tar.gz -tar xzvf ~/isri-downloads/mag.3B.tar.gz -tar xzvf ~/isri-downloads/news.3B.tar.gz -tar xzvf ~/isri-downloads/spn.3B.tar.gz -mkdir -p stopwords -cd stopwords -wget -O spa.stopwords.txt https://raw.githubusercontent.com/stopwords-iso/stopwords-es/master/stopwords-es.txt -``` -Edit ~/ISRI-OCRtk/stopwords/spa.stopwords.txt -wordacc uses a space delimited stopwords file, not line delimited. -s/\n/ /g - -Edit ~/ISRI-OCRtk/spn.3B/pages -Delete the line containing the following imagename as it [crashes tesseract](https://github.com/tesseract-ocr/tesseract/issues/1647#issuecomment-395954717). - -7733_005.3B 3 - -### Step 3: Download the modified ISRI toolkit, make and install the tools : -These will be installed in /usr/local/bin. - -``` -git clone https://github.com/Shreeshrii/ocr-evaluation-tools.git -cd ~/ocr-evaluation-tools -sudo make install -``` - -### Step 4: cd back to your main tesseract-ocr dir and Build tesseract. - -### Step 5: run unlvtests/runalltests.sh with the root ISRI data dir, testname, tessdata-dir: - -``` -unlvtests/runalltests.sh ~/ISRI-OCRtk 4_fast_eng ../tessdata_fast -``` -and go to the gym, have lunch etc. It takes a while to run. - -### Step 6: There should be a RELEASE.summary file -*unlvtests/reports/4-beta_fast.summary* that contains the final summarized accuracy -report and comparison with the 1995 results. - -### Step 7: run the test for Spanish. - -``` -unlvtests/runalltests_spa.sh ~/ISRI-OCRtk 4_fast_spa ../tessdata_fast -``` - -#### Notes from Nick White regarding wordacc - -If you just want to remove all lines which have 100% recognition, -you can add a 'awk' command like this: - -ocrevalutf8 wordacc ground.txt ocr.txt | awk '$3 != 100 {print $0}' -results.txt - -or if you've already got a results file you want to change, you can do this: - -awk '$3 != 100 {print $0}' results.txt newresults.txt - -If you only want the last sections where things are broken down by -word, you can add a sed command, like this: - -ocrevalutf8 wordacc ground.txt ocr.txt | sed '/^ Count Missed %Right $/,$ -!d' | awk '$3 != 100 {print $0}' results.txt diff --git a/unlvtests/counttestset.sh b/unlvtests/counttestset.sh deleted file mode 100755 index b3788285..00000000 --- a/unlvtests/counttestset.sh +++ /dev/null @@ -1,68 +0,0 @@ -#!/bin/bash -# File: counttestset.sh -# Description: Script to count the errors on a single UNLV set. -# Author: Ray Smith -# Created: Wed Jun 13 11:58:01 PDT 2007 -# -# (C) Copyright 2007, Google Inc. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -if [ $# -ne 2 ] -then - echo "Usage:$0 pagesfile langcode" - exit 1 -fi -if [ ! -d src/api ] -then - echo "Run $0 from the tesseract-ocr root directory!" - exit 1 -fi - -pages=$1 -langcode=$2 - -imdir=${pages%/pages} -setname=${imdir##*/} -resdir=unlvtests/results/$setname -mkdir -p unlvtests/reports -echo "Counting on set $setname in directory $imdir to $resdir" -accfiles="" -wafiles="" -while read page dir -do - if [ "$dir" ] - then - srcdir="$imdir/$dir" - else - srcdir="$imdir" - fi -#echo "$srcdir/$page.tif" - # Convert groundtruth and recognized text to UTF-8 to correctly treat accented letters. - iconv -f ISO8859-1 -t UTF-8 "$srcdir/$page.txt" >"$srcdir/$page.text" - iconv -f ISO8859-1 -t UTF-8 "$resdir/$page.unlv" >"$resdir/$page.text" - # Count character errors. - ocrevalutf8 accuracy "$srcdir/$page.text" "$resdir/$page.text" > "$resdir/$page.acc" - accfiles="$accfiles $resdir/$page.acc" - # Count word errors. - #langcode should be either eng or spa - if [ "$langcode" = "eng" ] - then - ocrevalutf8 wordacc "$srcdir/$page.text" "$resdir/$page.text" > "$resdir/$page.wa" - else - cp ~/ISRI-OCRtk/stopwords/spa.stopwords.txt "$resdir/spa.stopwords" - ocrevalutf8 wordacc -S"$resdir/spa.stopwords" "$srcdir/$page.text" "$resdir/$page.text" > "$resdir/$page.wa" - fi - wafiles="$wafiles $resdir/$page.wa" -done <"$pages" - -accsum $accfiles >"unlvtests/results/$setname.characc" -wordaccsum $wafiles >"unlvtests/results/$setname.wordacc" - diff --git a/unlvtests/reorgdata.sh b/unlvtests/reorgdata.sh deleted file mode 100755 index 34ad6d69..00000000 --- a/unlvtests/reorgdata.sh +++ /dev/null @@ -1,53 +0,0 @@ -#!/bin/bash -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -if [ $# -ne 1 ] -then - echo "Usage:$0 scantype" - echo "UNLV data comes in several scan types:" - echo "3B=300 dpi binary" - echo "3A=adaptive thresholded 300 dpi" - echo "3G=300 dpi grey" - echo "4B=400dpi binary" - echo "2B=200dpi binary" - echo "For now we only use 3B" - exit 1 -fi -ext=$1 - -#There are several test sets without meaningful names, so rename -#them with something a bit more meaningful. -#Each s is oldname/newname -for s in 3/doe3 B/bus M/mag N/news L/legal R/rep S/spn Z/zset -do - old=${s%/*} - #if this set was downloaded then process it. - if [ -r "$old/PAGES" ] - then - new=${s#*/}.$ext - mkdir -p "$new" - echo "Set $old -> $new" - #The pages file had - instead of _ so fix it and add the extension. - for page in $(cat $old/PAGES) - do - echo "${page%-*}_${page#*-}.$ext" - done >"$new/pages" - for f in $(cat "$new/pages") - do - #Put a tif extension on the tif files. - cp "$old/${old}_B/$f" "$new/$f.tif" - #Put a uzn extension on the zone files. - cp "$old/${old}_B/${f}Z" "$new/$f.uzn" - #Cat all the truth files together and put into a single txt file. - cat "$old/${old}_GT/${f%.$ext}".Z* >"$new/$f.txt" - done - fi -done diff --git a/unlvtests/reports/1995.bus.3B.sum b/unlvtests/reports/1995.bus.3B.sum deleted file mode 100644 index 00eb97a8..00000000 --- a/unlvtests/reports/1995.bus.3B.sum +++ /dev/null @@ -1 +0,0 @@ -1995 bus.3B 5959 98.14% 0.00% 1631 96.83% 0.00% 1293 95.73% 0.00% diff --git a/unlvtests/reports/1995.doe3.3B.sum b/unlvtests/reports/1995.doe3.3B.sum deleted file mode 100644 index 7eb753ae..00000000 --- a/unlvtests/reports/1995.doe3.3B.sum +++ /dev/null @@ -1 +0,0 @@ -1995 doe3.3B 36349 97.52% 0.00% 7826 96.34% 0.00% 7042 94.87% 0.00% diff --git a/unlvtests/reports/1995.mag.3B.sum b/unlvtests/reports/1995.mag.3B.sum deleted file mode 100644 index e718c543..00000000 --- a/unlvtests/reports/1995.mag.3B.sum +++ /dev/null @@ -1 +0,0 @@ -1995 mag.3B 15043 97.74% 0.00% 4566 96.01% 0.00% 3379 94.99% 0.00% diff --git a/unlvtests/reports/1995.news.3B.sum b/unlvtests/reports/1995.news.3B.sum deleted file mode 100644 index bd0b7c68..00000000 --- a/unlvtests/reports/1995.news.3B.sum +++ /dev/null @@ -1 +0,0 @@ -1995 news.3B 6432 98.69% 0.00% 1946 97.68% 0.00% 1502 96.94% 0.00% diff --git a/unlvtests/reports/2.03.summary b/unlvtests/reports/2.03.summary deleted file mode 100644 index 70f9cef8..00000000 --- a/unlvtests/reports/2.03.summary +++ /dev/null @@ -1,9 +0,0 @@ -1995 bus.3B 5959 98.14% 0.00% 1631 96.83% 0.00% 1293 95.73% 0.00% -1995 doe3.3B 36349 97.52% 0.00% 7826 96.34% 0.00% 7042 94.87% 0.00% -1995 mag.3B 15043 97.74% 0.00% 4566 96.01% 0.00% 3379 94.99% 0.00% -1995 news.3B 6432 98.69% 0.00% 1946 97.68% 0.00% 1502 96.94% 0.00% -2.03 bus.3B 6422 97.99% 7.77% 1750 96.60% 7.30% 1361 95.51 5.26% -2.03 doe3.3B 29520 97.98% -18.79% 7966 96.27% 1.79% 6764 95.07 -3.95% -2.03 mag.3B 14568 97.81% -3.16% 4288 96.25% -6.09% 3054 95.47 -9.62% -2.03 news.3B 7655 98.44% 19.01% 1730 97.94% -11.10% 1208 97.54 -19.57% -2.03 Total 58165 - -8.81% 15734 - -1.47% 12387 - -6.27% diff --git a/unlvtests/reports/2.04.summary b/unlvtests/reports/2.04.summary deleted file mode 100644 index ed6a10a5..00000000 --- a/unlvtests/reports/2.04.summary +++ /dev/null @@ -1,9 +0,0 @@ -1995 bus.3B 5959 98.14% 0.00% 1631 96.83% 0.00% 1293 95.73% 0.00% -1995 doe3.3B 36349 97.52% 0.00% 7826 96.34% 0.00% 7042 94.87% 0.00% -1995 mag.3B 15043 97.74% 0.00% 4566 96.01% 0.00% 3379 94.99% 0.00% -1995 news.3B 6432 98.69% 0.00% 1946 97.68% 0.00% 1502 96.94% 0.00% -2.04 bus.3B 6422 97.99% 7.77% 1750 96.60% 7.30% 1361 95.51 5.26% -2.04 doe3.3B 29514 97.98% -18.80% 7963 96.27% 1.75% 6762 95.07 -3.98% -2.04 mag.3B 14568 97.81% -3.16% 4289 96.25% -6.07% 3053 95.47 -9.65% -2.04 news.3B 7655 98.44% 19.01% 1730 97.94% -11.10% 1208 97.54 -19.57% -2.04 Total 58159 - -8.82% 15732 - -1.48% 12384 - -6.30% diff --git a/unlvtests/reports/4_best_int_spa.summary b/unlvtests/reports/4_best_int_spa.summary deleted file mode 100644 index cbb92073..00000000 --- a/unlvtests/reports/4_best_int_spa.summary +++ /dev/null @@ -1,2 +0,0 @@ -RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWordErrors Accuracy TimeTaken -4_best_int_spa spn.3B 2846 99.18% 937 98.39% 739 97.54 6478.02s diff --git a/unlvtests/reports/4_best_spa.summary b/unlvtests/reports/4_best_spa.summary deleted file mode 100644 index 69a7b75d..00000000 --- a/unlvtests/reports/4_best_spa.summary +++ /dev/null @@ -1,2 +0,0 @@ -RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWordErrors Accuracy TimeTaken -4_best_spa spn.3B 2823 99.19% 924 98.41% 729 97.57 7233.76s diff --git a/unlvtests/reports/4_fast_eng.summary b/unlvtests/reports/4_fast_eng.summary deleted file mode 100644 index e0325347..00000000 --- a/unlvtests/reports/4_fast_eng.summary +++ /dev/null @@ -1,9 +0,0 @@ -1995 bus.3B 5959 98.14% 0.00% 1631 96.83% 0.00% 1293 95.73% 0.00% -1995 doe3.3B 36349 97.52% 0.00% 7826 96.34% 0.00% 7042 94.87% 0.00% -1995 mag.3B 15043 97.74% 0.00% 4566 96.01% 0.00% 3379 94.99% 0.00% -1995 news.3B 6432 98.69% 0.00% 1946 97.68% 0.00% 1502 96.94% 0.00% -4_fast_eng bus.3B 6124 98.11% 2.77% 1138 97.88% -30.23% 963 97.05 -25.52% 3935.26s -4_fast_eng doe3.3B 30029 97.96% -17.39% 13781 94.45% 76.09% 13178 92.38 87.13% 18847.36s -4_fast_eng mag.3B 10934 98.37% -27.32% 3343 97.15% -26.78% 2813 96.06 -16.75% 6867.14s -4_fast_eng news.3B 5734 98.84% -10.85% 1322 98.45% -32.07% 1040 97.94 -30.76% 5527.38s -4_fast_eng Total 52821 - -17.19% 19584 - 22.64% 17994 - 36.15% diff --git a/unlvtests/reports/4_fast_spa.summary b/unlvtests/reports/4_fast_spa.summary deleted file mode 100644 index 6d25fe33..00000000 --- a/unlvtests/reports/4_fast_spa.summary +++ /dev/null @@ -1,2 +0,0 @@ -RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWordErrors Accuracy TimeTaken -4_fast_spa spn.3B 2841 99.18% 879 98.49% 742 97.53 3838.82s diff --git a/unlvtests/runalltests.sh b/unlvtests/runalltests.sh deleted file mode 100755 index 628a457e..00000000 --- a/unlvtests/runalltests.sh +++ /dev/null @@ -1,135 +0,0 @@ -#!/bin/bash -# File: runalltests.sh -# Description: Script to run a set of UNLV test sets for English. -# Author: Ray Smith -# Created: Thu Jun 14 08:21:01 PDT 2007 -# -# (C) Copyright 2007, Google Inc. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -if [ $# -ne 3 ] -then - echo "Usage:$0 unlv-data-dir version-id tessdata-dir" - exit 1 -fi -if [ ! -d src/api ] -then - echo "Run $0 from the tesseract-ocr root directory!" - exit 1 -fi -if [ ! -r src/api/tesseract ] && [ ! -r tesseract.exe ] -then - echo "Please build tesseract before running $0" - exit 1 -fi -tessdata=$3 - -#deltapc new old calculates the %change from old to new -deltapc() { -awk ' BEGIN { -printf("%.2f", 100.0*('"$1"'-'"$2"')/'"$2"'); -}' -} - -#timesum computes the total cpu time -timesum() { -awk ' BEGIN { -total = 0.0; -} -{ - total += $2; -} -END { - printf("%.2f\n", total); -}' "$1" -} - -imdir="$1" -vid="$2" -bindir=${0%/*} -if [ "$bindir" = "$0" ] -then - bindir="./" -fi -rdir=unlvtests/reports - -testsets="bus.3B doe3.3B mag.3B news.3B" -#testsets="bus.3B" - -totalerrs=0 -totalwerrs=0 -totalnswerrs=0 -totalolderrs=0 -totaloldwerrs=0 -totaloldnswerrs=0 -for set in $testsets -do - if [ -r "$imdir/$set/pages" ] - then - # Run tesseract on all the pages. - $bindir/runtestset.sh "$imdir/$set/pages" "$tessdata" "eng" - # Count the errors on all the pages. - $bindir/counttestset.sh "$imdir/$set/pages" "eng" - # Get the old character word and nonstop word errors. - olderrs=$(cut -f3 "unlvtests/reports/1995.$set.sum") - oldwerrs=$(cut -f6 "unlvtests/reports/1995.$set.sum") - oldnswerrs=$(cut -f9 "unlvtests/reports/1995.$set.sum") - # Get the new character word and nonstop word errors and accuracy. - cherrs=$(head -4 "unlvtests/results/$set.characc" |tail -1 |cut -c1-9 | - tr -d '[:blank:]') - chacc=$(head -5 "unlvtests/results/$set.characc" |tail -1 |cut -c1-9 | - tr -d '[:blank:]') - wderrs=$(head -4 "unlvtests/results/$set.wordacc" |tail -1 |cut -c1-9 | - tr -d '[:blank:]') - wdacc=$(head -5 "unlvtests/results/$set.wordacc" |tail -1 |cut -c1-9 | - tr -d '[:blank:]') - nswderrs=$(grep Total "unlvtests/results/$set.wordacc" |head -2 |tail -1 | - cut -c10-17 |tr -d '[:blank:]') - nswdacc=$(grep Total "unlvtests/results/$set.wordacc" |head -2 |tail -1 | - cut -c19-26 |tr -d '[:blank:]') - # Compute the percent change. - chdelta=$(deltapc "$cherrs" "$olderrs") - wdelta=$(deltapc "$wderrs" "$oldwerrs") - nswdelta=$(deltapc "$nswderrs" "$oldnswerrs") - sumfile=$rdir/$vid.$set.sum - if [ -r "unlvtests/results/$set.times" ] - then - total_time=$(timesum "unlvtests/results/$set.times") - if [ -r "unlvtests/results/prev/$set.times" ] - then - paste "unlvtests/results/prev/$set.times" "unlvtests/results/$set.times" | - awk '{ printf("%s %.2f\n", $1, $4-$2); }' |sort -k2n >"unlvtests/results/$set.timedelta" - fi - else - total_time='0.0' - fi - echo "$vid $set $cherrs $chacc $chdelta% $wderrs $wdacc\ - $wdelta% $nswderrs $nswdacc $nswdelta% ${total_time}s" >"$sumfile" - # Sum totals over all the testsets. - let totalerrs=totalerrs+cherrs - let totalwerrs=totalwerrs+wderrs - let totalnswerrs=totalnswerrs+nswderrs - let totalolderrs=totalolderrs+olderrs - let totaloldwerrs=totaloldwerrs+oldwerrs - let totaloldnswerrs=totaloldnswerrs+oldnswerrs - fi -done -# Compute grand total percent change. -chdelta=$(deltapc $totalerrs $totalolderrs) -wdelta=$(deltapc $totalwerrs $totaloldwerrs) -nswdelta=$(deltapc $totalnswerrs $totaloldnswerrs) -tfile=$rdir/$vid.total.sum -echo "$vid Total $totalerrs - $chdelta% $totalwerrs\ - - $wdelta% $totalnswerrs - $nswdelta%" >"$tfile" -cat $rdir/1995.*.sum "$rdir/$vid".*.sum >"$rdir/$vid".summary - -mv "$rdir/$vid".*.sum unlvtests/results/ -cat "$rdir/$vid".summary diff --git a/unlvtests/runalltests_spa.sh b/unlvtests/runalltests_spa.sh deleted file mode 100755 index a6e218bb..00000000 --- a/unlvtests/runalltests_spa.sh +++ /dev/null @@ -1,109 +0,0 @@ -#!/bin/bash -############################################################################## -# File: runalltests_spa.sh -# Description: Script to run a set of UNLV test sets for Spanish. -# based on runalltests.sh by Ray Smith -# Author: Shree Devi Kumar -# Created: June 09, 2018 -# -# (C) Copyright 2007, Google Inc. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -############################################################################## -if [ $# -ne 3 ] -then - echo "Usage:$0 unlv-data-dir version-id tessdata-dir" - exit 1 -fi -if [ ! -d src/api ] -then - echo "Run $0 from the tesseract-ocr root directory!" - exit 1 -fi -if [ ! -r src/api/tesseract ] && [ ! -r tesseract.exe ] -then - echo "Please build tesseract before running $0" - exit 1 -fi -tessdata=$3 -lang=$4 - -#timesum computes the total cpu time -timesum() { -awk ' BEGIN { -total = 0.0; -} -{ - total += $2; -} -END { - printf("%.2f\n", total); -}' "$1" -} - -imdir="$1" -vid="$2" -bindir=${0%/*} -if [ "$bindir" = "$0" ] -then - bindir="./" -fi -rdir=unlvtests/reports - -testsets="spn.3B" - -totalerrs=0 -totalwerrs=0 -totalnswerrs=0 -for set in $testsets -do - if [ -r "$imdir/$set/pages" ] - then - # Run tesseract on all the pages. - $bindir/runtestset.sh "$imdir/$set/pages" "$tessdata" "spa" - # Count the errors on all the pages. - $bindir/counttestset.sh "$imdir/$set/pages" "spa" - # Get the new character word and nonstop word errors and accuracy. - cherrs=$(head -4 "unlvtests/results/$set.characc" |tail -1 |cut -c1-9 | - tr -d '[:blank:]') - chacc=$(head -5 "unlvtests/results/$set.characc" |tail -1 |cut -c1-9 | - tr -d '[:blank:]') - wderrs=$(head -4 "unlvtests/results/$set.wordacc" |tail -1 |cut -c1-9 | - tr -d '[:blank:]') - wdacc=$(head -5 "unlvtests/results/$set.wordacc" |tail -1 |cut -c1-9 | - tr -d '[:blank:]') - nswderrs=$(grep Total "unlvtests/results/$set.wordacc" |head -2 |tail -1 | - cut -c10-17 |tr -d '[:blank:]') - nswdacc=$(grep Total "unlvtests/results/$set.wordacc" |head -2 |tail -1 | - cut -c19-26 |tr -d '[:blank:]') - -sumfile=$rdir/$vid.$set.sum - if [ -r "unlvtests/results/$set.times" ] - then - total_time=$(timesum "unlvtests/results/$set.times") - if [ -r "unlvtests/results/prev/$set.times" ] - then - paste "unlvtests/results/prev/$set.times" "unlvtests/results/$set.times" | - awk '{ printf("%s %.2f\n", $1, $4-$2); }' |sort -k2n >"unlvtests/results/$set.timedelta" - fi - else - total_time='0.0' - fi - echo "RELEASE TestSet CharErrors Accuracy WordErrors Accuracy\ - NonStopWordErrors Accuracy TimeTaken">"$sumfile" - echo "$vid $set $cherrs $chacc $wderrs $wdacc\ - $nswderrs $nswdacc ${total_time}s" >>"$sumfile" - fi -done - -cat "$rdir/$vid".*.sum >"$rdir/$vid".summary - -mv "$rdir/$vid".*.sum unlvtests/results/ -cat "$rdir/$vid".summary diff --git a/unlvtests/runtestset.sh b/unlvtests/runtestset.sh deleted file mode 100755 index 783f0bfb..00000000 --- a/unlvtests/runtestset.sh +++ /dev/null @@ -1,80 +0,0 @@ -#!/bin/bash -# File: runtestset.sh -# Description: Script to run tesseract on a single UNLV set. -# Author: Ray Smith -# Created: Wed Jun 13 10:13:01 PDT 2007 -# -# (C) Copyright 2007, Google Inc. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -if [ $# -ne 3 ] && [ $# -ne 4 ] -then - echo "Usage:$0 pagesfile tessdata-dir lang [-zoning]" - exit 1 -fi -if [ ! -d src/api ] -then - echo "Run $0 from the tesseract-ocr root directory!" - exit 1 -fi -if [ ! -r src/api/tesseract ] -then - if [ ! -r tesseract.exe ] - then - echo "Please build tesseract before running $0" - exit 1 - else - tess="./tesseract.exe" - fi -else - tess="time -f %U -o times.txt src/api/tesseract" - #tess="time -f %U -o times.txt tesseract" -fi - -tessdata=$2 -lang=$3 -pages=$1 -imdir=${pages%/pages} -setname=${imdir##*/} -if [ $# -eq 4 ] && [ "$4" = "-zoning" ] -then - config=unlv.auto - resdir=unlvtests/results/zoning.$setname -else - config=unlv - resdir=unlvtests/results/$setname -fi -echo -e "Testing on set $setname in directory $imdir to $resdir\n" -mkdir -p "$resdir" -rm -f "unlvtests/results/$setname.times" -while read page dir -do - # A pages file may be a list of files with subdirs or maybe just - # a plain list of files so accommodate both. - if [ "$dir" ] - then - srcdir="$imdir/$dir" - else - srcdir="$imdir" - fi -# echo "$srcdir/$page.tif" - $tess "$srcdir/$page.tif" "$resdir/$page" --tessdata-dir $tessdata --oem 1 -l $lang --psm 6 $config 2>&1 |grep -v "OCR Engine" |grep -v "Page 1" - if [ -r times.txt ] - then - read t >"unlvtests/results/$setname.times" - echo -e "\033M$page $t" - if [ "$t" = "Command terminated by signal 2" ] - then - exit 0 - fi - fi -done <"$pages"