mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2025-01-18 06:30:14 +08:00
correct script for eng, remove new reports from distribution
This commit is contained in:
parent
a4241c9817
commit
4753d8cbb6
@ -1,8 +1,8 @@
|
||||
|
||||
EXTRA_DIST = README.md
|
||||
EXTRA_DIST += counttestset.sh
|
||||
EXTRA_DIST += reorgdata.sh
|
||||
EXTRA_DIST += runalltests.sh
|
||||
EXTRA_DIST += runalltests_spa.sh
|
||||
EXTRA_DIST += runtestset.sh
|
||||
EXTRA_DIST += reports/1995.bus.3B.sum
|
||||
EXTRA_DIST += reports/1995.doe3.3B.sum
|
||||
@ -10,6 +10,3 @@ EXTRA_DIST += reports/1995.mag.3B.sum
|
||||
EXTRA_DIST += reports/1995.news.3B.sum
|
||||
EXTRA_DIST += reports/2.03.summary
|
||||
EXTRA_DIST += reports/2.04.summary
|
||||
EXTRA_DIST += reports/4_best_spa.summary
|
||||
EXTRA_DIST += reports/4_best_int_spa.summary
|
||||
EXTRA_DIST += reports/4_fast_spa.summary
|
||||
|
@ -45,9 +45,10 @@ do
|
||||
srcdir="$imdir"
|
||||
fi
|
||||
#echo "$srcdir/$page.tif"
|
||||
# Count character errors.
|
||||
iconv -f ISO8859-1 -t UTF-8 "$resdir/$page.unlv" >"$resdir/$page.text"
|
||||
# Convert groundtruth and recognized text to UTF-8 to correctly treat accented letters.
|
||||
iconv -f ISO8859-1 -t UTF-8 "$srcdir/$page.txt" >"$srcdir/$page.text"
|
||||
iconv -f ISO8859-1 -t UTF-8 "$resdir/$page.unlv" >"$resdir/$page.text"
|
||||
# Count character errors.
|
||||
ocrevalutf8 accuracy "$srcdir/$page.text" "$resdir/$page.text" > "$resdir/$page.acc"
|
||||
accfiles="$accfiles $resdir/$page.acc"
|
||||
# Count word errors.
|
||||
|
9
unlvtests/reports/4_fast_eng.summary
Normal file
9
unlvtests/reports/4_fast_eng.summary
Normal file
@ -0,0 +1,9 @@
|
||||
1995 bus.3B 5959 98.14% 0.00% 1631 96.83% 0.00% 1293 95.73% 0.00%
|
||||
1995 doe3.3B 36349 97.52% 0.00% 7826 96.34% 0.00% 7042 94.87% 0.00%
|
||||
1995 mag.3B 15043 97.74% 0.00% 4566 96.01% 0.00% 3379 94.99% 0.00%
|
||||
1995 news.3B 6432 98.69% 0.00% 1946 97.68% 0.00% 1502 96.94% 0.00%
|
||||
4_fast_eng bus.3B 6124 98.11% 2.77% 1138 97.88% -30.23% 963 97.05 -25.52% 3935.26s
|
||||
4_fast_eng doe3.3B 30029 97.96% -17.39% 13781 94.45% 76.09% 13178 92.38 87.13% 18847.36s
|
||||
4_fast_eng mag.3B 10934 98.37% -27.32% 3343 97.15% -26.78% 2813 96.06 -16.75% 6867.14s
|
||||
4_fast_eng news.3B 5734 98.84% -10.85% 1322 98.45% -32.07% 1040 97.94 -30.76% 5527.38s
|
||||
4_fast_eng Total 52821 - -17.19% 19584 - 22.64% 17994 - 36.15%
|
@ -77,7 +77,7 @@ do
|
||||
# Run tesseract on all the pages.
|
||||
$bindir/runtestset.sh "$imdir/$set/pages" "$tessdata" "eng"
|
||||
# Count the errors on all the pages.
|
||||
$bindir/counttestset.sh "$imdir/$set/pages"
|
||||
$bindir/counttestset.sh "$imdir/$set/pages" "eng"
|
||||
# Get the old character, word and nonstop word errors.
|
||||
olderrs=$(cut -f3 "unlvtests/reports/1995.$set.sum")
|
||||
oldwerrs=$(cut -f6 "unlvtests/reports/1995.$set.sum")
|
||||
|
Loading…
Reference in New Issue
Block a user