remove outdated scripts/contrib dir

2025-01-18 06:30:14 +08:00 · 2018-09-22 23:29:34 +02:00 · 2018-09-22 23:29:34 +02:00 · c003a60410
commit c003a60410
parent 01cf7402df
4 changed files with 0 additions and 412 deletions
--- a/contrib/genlangdata.pl
+++ b/contrib/genlangdata.pl
@ -1,264 +0,0 @@
-#!/usr/bin/perl
-
-use warnings;
-use strict;
-use utf8;
-
-use Getopt::Std;
-
-=pod
-
-=head1 NAME
-
-genwordlists.pl - generate word lists for Tesseract
-
-=head1 SYNOPSIS
-
-genwordlists.pl -i large_text_file -d outdir -p lang
-
-=head1 DESCRIPTION
-
-    genwordlists.pl -i large_text_file -d outdir -p lang
-
-Creates 4 files in C<outdir>: F<lang.word.bigrams.unsorted>,
-F<lang.word.numbers.unsorted>, F<lang.word.punc.unsorted>, and
-F<lang.wordlist.unsorted>, which (when sorted) can be used with
-C<wordlist2dawg> for Tesseract's language data.
-
-The script can also run as a filter. Given a set of files created
-by WikiExtractor (L<http://medialab.di.unipi.it/Project/SemaWiki/Tools/WikiExtractor.py>),
-use:
-
-    find WikiExtractor -type f | while read i; do \
-    pfx=$(echo $i|tr '/' '_'); cat $i | \
-    perl genwordlists.pl -d OUTDIR -p $pfx; done
-
-This will create a set of output files to match each of the files
-WikiExtractor created.
-
-To combine these files:
-
-    for i in word.bigrams.unsorted word.numbers.unsorted \
-    word.punc.unsorted wordlist.unsorted; do \
-    find OUTDIR -name "*$i" -exec cat '{}' \; |\
-    perl -CS -ane 'BEGIN{my %c=();} chomp;
-    my($a,$b)=split/\t/;if(defined $c{$a}){$c{$a}+=$b}
-    else {$c{$a} = $b;} END{while(my($k,$v)=each %c)
-    {print "$v\t$k\n";}}'|sort -nr > tmp.$i ;done
-
-Followed by:
-
-    for i in word.punc.unsorted word.bigrams.unsorted \
-    word.numbers.unsorted;do cat tmp.$i | \
-    awk -F'\t' '{print $2 "\t" $1}' > real.$i ; done
-    cat tmp.wordlist.unsorted | awk -F'\t' '{print $2}' \
-    > real.wordlist.unsorted
-
-Note that, although the langdata repository contains the
-counts of each item in most of the punctuation, number, and
-bigram files, these files must be filtered to only contain
-the first column, otherwise C<wordlist2dawg> will fail to write
-the output file.
-
-=head1 CAVEATS
-
-The format of the output files, and how the data are extracted,
-is based only on staring at the input files and taking a guess.
-They may be wildly inaccurate.
-
-The only part I can say for certain is correct is that digits
-are replaced with '?' in the bigram wordlist (the .numbers wordlist
-uses a space instead). (See F<dict/dict.cpp> in the Tesseract source).
-
-=head1 COPYRIGHT
-
-Copyright 2014 Jim O'Regan
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-L<http://www.apache.org/licenses/LICENSE-2.0>
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-
-=head1 SEE ALSO
-
-L<wordlist2dawg(1)>
-
-=cut
-
-# I haven't looked into this too much
-my %lig = (
-	# Longest first
-	'ffi' => 'ﬃ',
-	'ct' => "\N{U+E003}",
-	'ff' => 'ﬀ',
-	'fi' => 'ﬁ',
-	'fl' => 'ﬂ',
-	'st' => 'ﬆ',
-);
-
-my %punct;
-my %num;
-my %bigrams;
-my %opts;
-my %words;
-
-my $do_ligatures = 0;
-
-getopts("hli:p:d:", \%opts);
-
-if (defined $opts{h}) {
-	print "Usage: genwordlists [options]\n";
-	print "-h\tPrints a brief help message\n";
-	print "-d\tSet the output directory (default is current)\n";
-	print "-b\tSet the prefix for the language data (e.g., eng for English)\n";
-	print "-l\tProcess ligatures\n";
-	print "-i\tSet the input file. If not set, reads from stdin\n";
-	exit;
-}
-
-if (defined $opts{l}) {
-	$do_ligatures = 1;
-}
-
-my $prefix = '';
-if (!defined $opts{p}) {
-	print "Prefix (-p) must be set!\n";
-	exit;
-} else {
-	if (defined $opts{d}) {
-		$prefix = $opts{d};
-		$prefix =~ s/\/$//;
-		$prefix .= '/';
-	}
-	$prefix .= $opts{p};
-	# Easiest is to drop the trailing dot, if present, and re-add it
-	$prefix =~ s/\.$//;
-	$prefix .= ".";
-}
-
-my $input;
-if (defined $opts{i}) {
-	open ($input, "<", $opts{i}) or die $!;
-#} elsif ($#ARGV > 0) {
-#	open ($input, "<", $ARGV[0]) or die $!;
-} else {
-	$input = *STDIN;
-}
-binmode $input, ":utf8";
-
-while (<$input>) {
-	chomp;
-	tr/\t/ /;
-
-	next if (/^<doc/);
-	next if (/^<\/doc/);
-	next if (/^$/);
-	next if (/^[ \t]*$/);
-	next if (/^\]\]$/);
-
-	my @punct = $_ =~ /([ \p{Punct}]*)/g;
-	for my $i (@punct) {
-		if(defined($punct{$i})) {
-			$punct{$i}++;
-		} else {
-			$punct{$i} = 1;
-		}
-	}
-	my @rawnumtok = split(/ /);
-	my @numtok = map { local $_ = $_; s/[0-9]/ /g; $_ } grep(/[0-9]/, @rawnumtok);
-	for my $i (@numtok) {
-		if(defined($num{$i})) {
-			$num{$i}++;
-		} else {
-			$num{$i} = 1;
-		}
-	}
-
-	my @bitoksraw = map { local $_ = $_; s/[0-9]/?/g; $_ } split(/ |[ \p{Punct}][ \p{Punct}]+/);
-	if ($#bitoksraw > 0) {
-		my @first = @bitoksraw;
-		my $discard = shift @bitoksraw;
-		for (my $j = 0; $j != $#first; $j++) {
-			if ($bitoksraw[$j] ne '' && $first[$j] ne '') {
-				my $tok = $first[$j] . " " . $bitoksraw[$j];
-				#Not keeping count of these, but this can be useful for trimming
-				if(defined($bigrams{$tok})) {
-					$bigrams{$tok}++;
-				} else {
-					$bigrams{$tok} = 1;
-				}
-				if($do_ligatures == 1) {
-					my $other = do_lig($tok);
-					if ($other ne $tok) {
-						if(defined($bigrams{$other})) {
-							$bigrams{$other}++;
-						} else {
-							$bigrams{$other} = 1;
-						}
-					}
-				}
-			}
-		}
-	}
-	my @wordl = grep { !/[0-9 \p{Punct}]/ } split (/[ \p{Punct}]+/);
-	if ($#wordl >= 0) {
-		for my $word (@wordl) {
-			if (defined $words{$word}) {
-				$words{$word}++;
-			} else {
-				$words{$word} = 1;
-			}
-		}
-	}
-}
-
-if (defined $opts{i}) {
-	close $input;
-}
-
-open(BIGRAMS, ">", "${prefix}word.bigrams.unsorted");
-binmode BIGRAMS, ":utf8";
-while (my($k, $v) = each %bigrams) {
-	print BIGRAMS "$k\t$v\n";
-}
-close BIGRAMS;
-%bigrams = ();
-
-open(PUNCT, ">", "${prefix}word.punc.unsorted");
-binmode PUNCT, ":utf8";
-while (my($k, $v) = each %punct) {
-	print PUNCT "$k\t$v\n";
-}
-close PUNCT;
-%punct = ();
-
-open(NUMS, ">", "${prefix}word.numbers.unsorted");
-binmode NUMS, ":utf8";
-while (my($k, $v) = each %num) {
-	print NUMS "$k\t$v\n";
-}
-close NUMS;
-%num = ();
-
-open(WORDS, ">", "${prefix}wordlist.unsorted");
-binmode WORDS, ":utf8";
-while (my($k, $v) = each %words) {
-	print WORDS "$k\t$v\n";
-}
-close WORDS;
-%words = ();
-
-sub do_lig {
-	my $word = shift;
-	while (my($k, $v) = each %lig) {
-		$word =~ s/$k/$v/g;
-	}
-	$word;
-}
--- a/contrib/tesseract-c_api-demo.py
+++ b/contrib/tesseract-c_api-demo.py
@ -1,74 +0,0 @@
-#!/usr/bin/python
-# -*- coding: utf-8 -*-
-
-# Copyright 2012 Zdenko Podobný
-# Author: Zdenko Podobný
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""
-Simple python demo script of tesseract-ocr 3.02 c-api
-"""
-
-import os
-import sys
-import ctypes
-
-# Demo variables
-lang = "eng"
-filename = "../phototest.tif"
-libpath = "/usr/local/lib64/"
-libpath_w = "../vs2010/DLL_Release/"
-TESSDATA_PREFIX = os.environ.get('TESSDATA_PREFIX')
-if not TESSDATA_PREFIX:
-    TESSDATA_PREFIX = "../"
-
-if sys.platform == "win32":
-	libname = libpath_w + "libtesseract302.dll"
-	libname_alt = "libtesseract302.dll"
-	os.environ["PATH"] += os.pathsep + libpath_w
-else:
-	libname = libpath + "libtesseract.so.3.0.2"
-	libname_alt = "libtesseract.so.3"
-
-try:
-	tesseract = ctypes.cdll.LoadLibrary(libname)
-except:
-	try:
-		tesseract = ctypes.cdll.LoadLibrary(libname_alt)
-	except WindowsError, err:
-		print("Trying to load '%s'..." % libname)
-		print("Trying to load '%s'..." % libname_alt)
-		print(err)
-		exit(1)
-
-tesseract.TessVersion.restype = ctypes.c_char_p
-tesseract_version = tesseract.TessVersion()[:4]
-
-# We need to check the library version because libtesseract.so.3 is a symlink
-# and can point to a version other than 3.02
-if float(tesseract_version) < 3.02:
-	print("Found tesseract-ocr library version %s." % tesseract_version)
-	print("C-API is present only in version 3.02!")
-	exit(2)
-
-api = tesseract.TessBaseAPICreate()
-rc = tesseract.TessBaseAPIInit3(api, TESSDATA_PREFIX, lang);
-if (rc):
-	tesseract.TessBaseAPIDelete(api)
-	print("Could not initialize tesseract.\n")
-	exit(3)
-
-text_out = tesseract.TessBaseAPIProcessPages(api, filename, None , 0);
-result_text = ctypes.string_at(text_out)
-print result_text
--- a/contrib/tesseract.completion
+++ b/contrib/tesseract.completion
@ -1,39 +0,0 @@
-#-*- mode: shell-script;-*-
-#
-# bash completion support for tesseract
-#
-# Copyright (C) 2009 Neskie A. Manuel <neskiem@gmail.com>
-# Distributed under the Apache License, Version 2.0.
-#
-
-_tesseract_languages()
-{
-	local TESSDATA="/usr/share/tesseract-ocr/tessdata/"
-	local langs="$(ls $TESSDATA | grep traineddata | cut -d \. -f 1)"
-
-	COMPREPLY=(${COMPREPLY[@]:-} $(compgen -W "$langs" -- "$cur") )
-}
-
-_tesseract()
-{
-	local cur prev
-        COMPREPLY=()
-        cur="$2"
-        prev="$3"
-
-	case "$prev" in
-		tesseract)
-			COMPREPLY=($(compgen -f -X "!*.+(tif)" -- "$cur") )
-		;;
-		*.tif)
-			COMPREPLY=($(compgen -W "$(basename  $prev .tif)" ) )
-		;;
-		-l)
-			_tesseract_languages
-		;;
-		*)
-			COMPREPLY=($(compgen -W "-l" ) )
-		;;
-    esac
-}
-complete -F _tesseract -o nospace tesseract
--- a/contrib/traineddata.txt
+++ b/contrib/traineddata.txt
@ -1,35 +0,0 @@
-bul	Bulgarian
-cat	Catalan
-ces	Czech
-chi_sim	Simplified Chinese
-chi_tra	Traditional Chinese
-dan-frak	Danish (Fraktur)
-dan	Danish
-deu	German
-ell	Greek
-eng	English
-fin	Finnish
-fra	French
-hun	Hungarian
-ind	Indonesian
-ita	Italian
-jpn	Japanese
-kor	Korean
-lav	Latvian
-lit	Lithuanian
-nld	Dutch
-nor	Norwegian
-pol	Polish
-por	Portuguese
-ron	Romanian
-rus	Russian
-slk	Slovakian
-slv	Slovenian
-spa	Spanish
-srp	Serbian
-swe	Swedish
-tgl	Tagalog
-tha	Thai
-tur	Turkish
-ukr	Ukrainian
-vie	Vietnamese