Merge pull request #1658 from Shreeshrii/master

correct script for eng, remove new reports from distribution
This commit is contained in:
Egor Pugin 2018-06-10 14:15:51 +03:00 committed by GitHub
commit e5d11b5297
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 19 additions and 12 deletions

10
.gitignore vendored
View File

@@ -64,6 +64,8 @@ build/*
/bin
*/.deps/*
*/.libs/*
*/*/.deps/*
*/*/.libs/*
*.lo
*.la
*.o
@@ -102,9 +104,7 @@ kernel*.bin
/unittest/osd_test
/unittest/tesseracttests
# ocreval tool and generated files from unlvtests
/unlvtests/ocreval*
# generated files from unlvtests
times.txt
/unlvtests/results*
/unlvtests/reports/*.characc
/unlvtests/reports/*.times
/unlvtests/reports/*.wordacc

View File

@@ -1,8 +1,8 @@
EXTRA_DIST = README.md
EXTRA_DIST += counttestset.sh
EXTRA_DIST += reorgdata.sh
EXTRA_DIST += runalltests.sh
EXTRA_DIST += runalltests_spa.sh
EXTRA_DIST += runtestset.sh
EXTRA_DIST += reports/1995.bus.3B.sum
EXTRA_DIST += reports/1995.doe3.3B.sum
@@ -10,6 +10,3 @@ EXTRA_DIST += reports/1995.mag.3B.sum
EXTRA_DIST += reports/1995.news.3B.sum
EXTRA_DIST += reports/2.03.summary
EXTRA_DIST += reports/2.04.summary
EXTRA_DIST += reports/4_best_spa.summary
EXTRA_DIST += reports/4_best_int_spa.summary
EXTRA_DIST += reports/4_fast_spa.summary

View File

@@ -45,9 +45,10 @@ do
srcdir="$imdir"
fi
#echo "$srcdir/$page.tif"
# Count character errors.
iconv -f ISO8859-1 -t UTF-8 "$resdir/$page.unlv" >"$resdir/$page.text"
# Convert groundtruth and recognized text to UTF-8 to correctly treat accented letters.
iconv -f ISO8859-1 -t UTF-8 "$srcdir/$page.txt" >"$srcdir/$page.text"
iconv -f ISO8859-1 -t UTF-8 "$resdir/$page.unlv" >"$resdir/$page.text"
# Count character errors.
ocrevalutf8 accuracy "$srcdir/$page.text" "$resdir/$page.text" > "$resdir/$page.acc"
accfiles="$accfiles $resdir/$page.acc"
# Count word errors.

View File

@@ -0,0 +1,9 @@
1995 bus.3B 5959 98.14% 0.00% 1631 96.83% 0.00% 1293 95.73% 0.00%
1995 doe3.3B 36349 97.52% 0.00% 7826 96.34% 0.00% 7042 94.87% 0.00%
1995 mag.3B 15043 97.74% 0.00% 4566 96.01% 0.00% 3379 94.99% 0.00%
1995 news.3B 6432 98.69% 0.00% 1946 97.68% 0.00% 1502 96.94% 0.00%
4_fast_eng bus.3B 6124 98.11% 2.77% 1138 97.88% -30.23% 963 97.05 -25.52% 3935.26s
4_fast_eng doe3.3B 30029 97.96% -17.39% 13781 94.45% 76.09% 13178 92.38 87.13% 18847.36s
4_fast_eng mag.3B 10934 98.37% -27.32% 3343 97.15% -26.78% 2813 96.06 -16.75% 6867.14s
4_fast_eng news.3B 5734 98.84% -10.85% 1322 98.45% -32.07% 1040 97.94 -30.76% 5527.38s
4_fast_eng Total 52821 - -17.19% 19584 - 22.64% 17994 - 36.15%

View File

@@ -77,7 +77,7 @@ do
# Run tesseract on all the pages.
$bindir/runtestset.sh "$imdir/$set/pages" "$tessdata" "eng"
# Count the errors on all the pages.
$bindir/counttestset.sh "$imdir/$set/pages"
$bindir/counttestset.sh "$imdir/$set/pages" "eng"
# Get the old character word and nonstop word errors.
olderrs=$(cut -f3 "unlvtests/reports/1995.$set.sum")
oldwerrs=$(cut -f6 "unlvtests/reports/1995.$set.sum")