mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2024-12-12 07:29:07 +08:00
25 lines
842 B
Bash
25 lines
842 B
Bash
#!/bin/bash
|
|
#
|
|
mkdir -p ~/lang-downloads
|
|
cd ~/lang-downloads
|
|
wget -O frk-jbarth-ubhd.zip http://digi.ub.uni-heidelberg.de/diglitData/v/abbyy11r8-vs-tesseract4.zip
|
|
wget -O frk-stweil-gt.zip https://digi.bib.uni-mannheim.de/~stweil/fraktur-gt.zip
|
|
|
|
mkdir -p ~/lang-files
|
|
cd ~/lang-files
|
|
unzip ~/lang-downloads/frk-jbarth-ubhd.zip -d frk
|
|
unzip ~/lang-downloads/frk-stweil-gt.zip -d frk
|
|
mkdir -p ./frk-ligatures
|
|
cp ./frk/abbyy-vs-tesseract/*.tif ./frk-ligatures/
|
|
cp ./frk/gt/*.txt ./frk-ligatures/
|
|
|
|
cd ./frk-ligatures/
|
|
ls -1 *.tif >pages
|
|
sed -i -e 's/.tif//g' pages
|
|
|
|
mkdir -p ~/lang-stopwords
|
|
cd ~/lang-stopwords
|
|
wget -O frk.stopwords.txt https://raw.githubusercontent.com/stopwords-iso/stopwords-de/master/stopwords-de.txt
|
|
|
|
echo "Edit ~/lang-files/stopwords/frk.stopwords.txt as wordacc uses a space delimited stopwords file, not line delimited."
|