mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2025-01-23 19:13:00 +08:00
4550ba71fb
git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@779 d0cd1f9f-072b-0410-8dd7-cf729c803f20
506 lines
17 KiB
Python
506 lines
17 KiB
Python
#!/usr/bin/env python

"""
tesshelper.py -- Utility operations to compare, report stats, and copy
                 public headers for tesseract 3.0x VS2008 Project

$RCSfile: tesshelper.py,v $ $Revision: 7ca575b377aa $ $Date: 2012/03/07 17:26:31 $
"""

r"""
Requires:

    python 2.7 or greater: activestate.com
    http://www.activestate.com/activepython/downloads

    because it uses the argparse module and the set literal syntax
    (s = {1, 2}).

General Notes:
--------------

Format for a .vcproj file entry:

            <File
                RelativePath="..\src\allheaders.h"
                >
            </File>

"""
|
|
|
|
# Help epilog shown by argparse after the option list (RawDescription
# formatting, so the layout below is printed verbatim).
epilogStr = r"""
Examples:

Assume that tesshelper.py is in c:\buildfolder\tesseract-3.02\vs2008,
which is also the current directory. Then,

    python tesshelper.py .. compare

will compare c:\buildfolder\tesseract-3.02 "library" directories to the
libtesseract Project
(c:\buildfolder\tesseract-3.02\vs2008\libtesseract\libtesseract.vcproj).

    python tesshelper.py .. report

will display summary stats for c:\buildfolder\tesseract-3.02 "library"
directories and the libtesseract Project.

    python tesshelper.py .. copy ..\..\include

will copy all "public" libtesseract header files to
c:\buildfolder\include.

    python tesshelper.py .. clean

will clean the vs2008 folder of all build directories, and .user, .suo,
.ncb, and other temp files.

"""
|
|
|
|
# imports of python standard library modules
|
|
# See Python Documentation | Library Reference for details
|
|
import collections
|
|
import glob
|
|
import argparse
|
|
import os
|
|
import re
|
|
import shutil
|
|
import sys
|
|
|
|
# ====================================================================
|
|
|
|
# Tool version: "1.0 <date>", where <date> is the second whitespace-separated
# field of the RCS $Date$ keyword below.
VERSION = "1.0 %s" % "$Date: 2012/03/07 17:26:31 $".split()[1]

# Location of the libtesseract project file, relative to the tesseract root.
PROJ_SUBDIR = os.path.join("vs2008", "libtesseract")
PROJFILE = "libtesseract.vcproj"

# Output files receiving ready-to-paste <File> nodes for files that are on
# disk but not yet in the project.
NEWHEADERS_FILENAME = "newheaders.txt"
NEWSOURCES_FILENAME = "newsources.txt"

# One .vcproj <File> node; "%s" is the file path relative to the tesseract
# root (the project lives two levels below it, hence the "..\..\" prefix).
# Raw string: "\." and "\%" are not valid escape sequences.
# NOTE(review): the leading indentation is cosmetic (it only affects how the
# node lines up once pasted into a .vcproj) -- adjust to match the project.
fileNodeTemplate = \
r'''            <File
                RelativePath="..\..\%s"
                >
            </File>
'''
|
|
|
|
# ====================================================================
|
|
|
|
def getProjectfiles(libTessDir, libProjectFile, nTrimChars):
    """Return the sets of files referenced by the libtesseract Project.

    Args:
        libTessDir: directory holding the .vcproj; every RelativePath entry
            is resolved against it.
        libProjectFile: path of the .vcproj file to scan.
        nTrimChars: number of leading characters removed from each resolved
            absolute path (callers pass len(absolute tesseract dir) + 1 so
            the result is relative to the tesseract root).

    Returns:
        A 4-tuple (projectFilesSet, projectHFiles, projectCFiles,
        projectRFiles). projectFilesSet holds every referenced file as a
        trimmed, lower-cased path; the other three hold the RelativePath
        strings exactly as written in the project, split into .h files,
        .c/.cpp files and .rc files respectively.
    """
    with open(libProjectFile, "r") as f:
        data = f.read()

    projectCFiles = set()
    projectHFiles = set()
    projectRFiles = set()
    projectFilesSet = set()

    # Only entries whose path starts with "." (i.e. relative paths) match.
    for projectFile in re.findall(r'(?i)RelativePath="(\.[^"]+)"', data):
        ext = os.path.splitext(projectFile.lower())[1]
        if ext in (".c", ".cpp"):
            projectCFiles.add(projectFile)
        elif ext == ".h":
            projectHFiles.add(projectFile)
        elif ext == ".rc":
            projectRFiles.add(projectFile)
        else:
            # Reported, but still recorded in projectFilesSet below.
            print("unknown file type: %s" % projectFile)

        absolutePath = os.path.abspath(os.path.join(libTessDir, projectFile))
        projectFilesSet.add(absolutePath[nTrimChars:].lower())

    return projectFilesSet, projectHFiles, projectCFiles, projectRFiles
|
|
|
|
def getTessLibFiles(tessDir, nTrimChars):
    """Return the set of all libtesseract files found under tessDir.

    Args:
        tessDir: tesseract root directory.
        nTrimChars: number of leading characters removed from each absolute
            path (callers pass len(absolute tesseract dir) + 1 so the result
            is relative to the tesseract root).

    Returns:
        Set of trimmed, lower-cased paths of every *.c, *.cpp, *.h and *.rc
        file located directly inside one of the "library" directories
        (sub-directories are not searched).
    """
    # "training" is deliberately absent: it is not part of the library.
    libDirs = (
        "api",
        "ccmain",
        "ccstruct",
        "ccutil",
        "classify",
        "cube",
        "cutil",
        "dict",
        "image",
        os.path.join("neural_networks", "runtime"),
        "textord",
        "viewer",
        "wordrec",
        os.path.join("vs2008", "port"),
        os.path.join("vs2008", "libtesseract"),
    )
    fileTypes = ("*.c", "*.cpp", "*.h", "*.rc")

    tessFiles = set()
    for curDir in libDirs:
        baseDir = os.path.join(tessDir, curDir)
        for filetype in fileTypes:
            for curFile in glob.glob(os.path.join(baseDir, filetype)):
                tessFiles.add(os.path.abspath(curFile)[nTrimChars:].lower())

    return tessFiles
|
|
|
|
# ====================================================================
|
|
|
|
def tessCompare(tessDir):
    '''Compare libtesseract Project files and actual "sub-library" files.

    Prints the "extra" files (on disk in the library directories but missing
    from the Project) and the "dead" files (listed in the Project but missing
    on disk). For the extra files, ready-to-paste <File> nodes are written
    to NEWHEADERS_FILENAME (.h files) and NEWSOURCES_FILENAME (everything
    else), both created in the current working directory.

    Args:
        tessDir: tesseract root directory.
    '''
    libTessDir = os.path.join(tessDir, "vs2008", "libtesseract")
    libProjectFile = os.path.join(libTessDir, "libtesseract.vcproj")
    tessAbsDir = os.path.abspath(tessDir)
    # +1 also drops the path separator that follows the root directory.
    nTrimChars = len(tessAbsDir) + 1
    print('Comparing VS2008 Project "%s" with\n "%s"' % (libProjectFile,
                                                         tessAbsDir))

    projectFilesSet, projectHFiles, projectCFiles, projectRFiles = \
        getProjectfiles(libTessDir, libProjectFile, nTrimChars)
    tessFiles = getTessLibFiles(tessDir, nTrimChars)

    extraFiles = tessFiles - projectFilesSet
    print("%2d Extra files (in %s but not in Project)" % (len(extraFiles),
                                                          tessAbsDir))
    headerFiles = []
    sourceFiles = []
    # Iterating in sorted order keeps both lists sorted as they are built.
    for filename in sorted(extraFiles):
        ext = os.path.splitext(filename.lower())[1]
        if ext == ".h":
            headerFiles.append(filename)
        else:
            sourceFiles.append(filename)
        print(" %s " % filename)

    print("")
    for kind, filenames, outputName in (
            ("header", headerFiles, NEWHEADERS_FILENAME),
            ("source", sourceFiles, NEWSOURCES_FILENAME)):
        print("%2d new %s file items written to %s" % (len(filenames), kind,
                                                       outputName))
        with open(outputName, "w") as f:
            for filename in filenames:
                f.write(fileNodeTemplate % filename)
    print("")

    deadFiles = projectFilesSet - tessFiles
    print("%2d Dead files (in Project but not in %s)" % (len(deadFiles),
                                                         tessAbsDir))
    for filename in sorted(deadFiles):
        print(" %s " % filename)
|
|
|
|
# ====================================================================
|
|
|
|
def tessReport(tessDir):
    """Report summary stats on "sub-library" files and libtesseract Project file.

    Prints, per library directory, the number of .h and .cpp files found on
    disk (with grand totals), followed by the number of header, source and
    resource files referenced by the Project.

    Args:
        tessDir: tesseract root directory.
    """
    libTessDir = os.path.join(tessDir, "vs2008", "libtesseract")
    libProjectFile = os.path.join(libTessDir, "libtesseract.vcproj")
    tessAbsDir = os.path.abspath(tessDir)
    # +1 also drops the path separator that follows the root directory.
    nTrimChars = len(tessAbsDir) + 1

    projectFilesSet, projectHFiles, projectCFiles, projectRFiles = \
        getProjectfiles(libTessDir, libProjectFile, nTrimChars)
    tessFiles = getTessLibFiles(tessDir, nTrimChars)

    print('Summary stats for "%s" library directories' % tessAbsDir)

    # folder -> Counter keyed by extension without the leading dot.
    folderCounters = collections.defaultdict(collections.Counter)
    for tessFile in tessFiles:
        folder, head = os.path.split(tessFile.lower())
        ext = os.path.splitext(head)[1]
        folderCounters[folder][ext[1:]] += 1

    totalFiles = 0
    totalH = 0
    totalCPP = 0

    print("")
    # Header built with the same field widths as the rows so columns align.
    print(" %5s %3s %3s" % ("total", "h", "cpp"))
    print(" ----- --- ---")
    for folder in sorted(folderCounters):
        counters = folderCounters[folder]
        # NOTE(review): only .h and .cpp files are reported; files with other
        # extensions (e.g. .c, .rc) are tallied above but not printed.
        nHFiles = counters['h']
        nCPPFiles = counters['cpp']

        total = nHFiles + nCPPFiles
        totalFiles += total
        totalH += nHFiles
        totalCPP += nCPPFiles

        print(" %5d %3d %3d %s" % (total, nHFiles, nCPPFiles, folder))
    print(" ----- --- ---")
    print(" %5d %3d %3d" % (totalFiles, totalH, totalCPP))

    print("")
    print('Summary stats for VS2008 Project "%s"' % libProjectFile)
    print(" %5d %s" % (len(projectHFiles), "Header files"))
    print(" %5d %s" % (len(projectCFiles), "Source files"))
    print(" %5d %s" % (len(projectRFiles), "Resource files"))
    print(" -----")
    print(" %5d" % (len(projectHFiles) + len(projectCFiles)
                    + len(projectRFiles),))
|
|
|
|
# ====================================================================
|
|
|
|
def copyIncludes(fileSet, description, tessDir, includeDir):
    """Copy set of files to specified include dir.

    Each file is copied directly into includeDir (no sub-directories are
    created there). Progress, a final count and the list of files that could
    not be found are printed to stdout.

    Args:
        fileSet: iterable of file paths relative to tessDir.
        description: label used in the progress banner (e.g. "public").
        tessDir: tesseract root directory the relative paths are joined to.
        includeDir: existing destination directory.
    """
    print("")
    print('Copying libtesseract "%s" headers to %s' % (description, includeDir))
    print("")

    count = 0
    errList = []
    for includeFile in sorted(fileSet):
        filepath = os.path.join(tessDir, includeFile)
        if os.path.isfile(filepath):
            shutil.copy2(filepath, includeDir)
            print("Copied: %s" % includeFile)
            count += 1
        else:
            print('***Error: "%s" doesn\'t exist' % filepath)
            errList.append(filepath)

    print('%d header files successfully copied to "%s"' % (count, includeDir))
    if errList:
        print("The following %d files were not copied:" % len(errList))
        for filepath in errList:
            print(" %s" % filepath)
|
|
|
|
def tessCopy(tessDir, includeDir):
    '''Copy all "public" libtesseract Project header files to include directory.

    The public headers are copied into <includeDir>/tesseract (created if
    missing) and the extra .vsprops files into includeDir itself. Files are
    copied flat: the source sub-directory (api, ccutil, ...) is not
    reproduced in the destination.

    Args:
        tessDir: tesseract root directory.
        includeDir: existing destination directory.
    '''
    baseIncludeSet = {
        r"api\baseapi.h",
        r"api\capi.h",
        r"api\apitypes.h",
        r"ccstruct\publictypes.h",
        r"ccmain\thresholder.h",
        r"ccutil\host.h",
        r"ccutil\basedir.h",
        r"ccutil\tesscallback.h",
        r"ccutil\unichar.h",
        r"ccutil\platform.h",
    }

    # ccutil\genericvector.h is not listed here; it is part of
    # resultIteratorIncludeSet and genericVectorIncludeSet.
    strngIncludeSet = {
        r"ccutil\strngs.h",
        r"ccutil\memry.h",
        r"ccutil\host.h",
        r"ccutil\serialis.h",
        r"ccutil\errcode.h",
        r"ccutil\fileerr.h",
    }

    resultIteratorIncludeSet = {
        r"ccmain\ltrresultiterator.h",
        r"ccmain\pageiterator.h",
        r"ccmain\resultiterator.h",
        r"ccutil\genericvector.h",
        r"ccutil\tesscallback.h",
        r"ccutil\errcode.h",
        r"ccutil\host.h",
        r"ccutil\helpers.h",
        r"ccutil\ndminx.h",
        r"ccutil\params.h",
        r"ccutil\unicharmap.h",
        r"ccutil\unicharset.h",
    }

    # The next two sets are not part of the current selection; they belong to
    # the alternative selection noted below.
    genericVectorIncludeSet = {
        r"ccutil\genericvector.h",
        r"ccutil\tesscallback.h",
        r"ccutil\errcode.h",
        r"ccutil\host.h",
        r"ccutil\helpers.h",
        r"ccutil\ndminx.h",
    }

    blobsIncludeSet = {
        r"ccstruct\blobs.h",
        r"ccstruct\rect.h",
        r"ccstruct\points.h",
        r"ccstruct\ipoints.h",
        r"ccutil\elst.h",
        r"ccutil\host.h",
        r"ccutil\serialis.h",
        r"ccutil\lsterr.h",
        r"ccutil\ndminx.h",
        r"ccutil\tprintf.h",
        r"ccutil\params.h",
        r"viewer\scrollview.h",
        r"ccstruct\vecfuncs.h",
    }

    # vs2008\include\stdint.h is intentionally not copied.
    extraFilesSet = {
        r"vs2008\include\leptonica_versionnumbers.vsprops",
        r"vs2008\include\tesseract_versionnumbers.vsprops",
    }

    tessIncludeDir = os.path.join(includeDir, "tesseract")
    if os.path.isfile(tessIncludeDir):
        print('Aborting: "%s" is a file not a directory.' % tessIncludeDir)
        return
    if not os.path.exists(tessIncludeDir):
        os.mkdir(tessIncludeDir)

    # Alternative selection:
    #   baseIncludeSet | strngIncludeSet | genericVectorIncludeSet
    #   | blobsIncludeSet
    fileSet = baseIncludeSet | strngIncludeSet | resultIteratorIncludeSet

    copyIncludes(fileSet, "public", tessDir, tessIncludeDir)
    copyIncludes(extraFilesSet, "extra", tessDir, includeDir)
|
|
|
|
# ====================================================================
|
|
|
|
def tessClean(tessDir):
    '''Clean vs2008 folder of all build directories and certain temp files.

    Asks for confirmation first (anything but "yes" aborts), then asks
    whether to only list the items (anything but "no" means list only).
    Both answers are compared case-insensitively, ignoring surrounding
    whitespace.

    Removed (or listed):
      * every LIB_Release, LIB_Debug, DLL_Release and DLL_Debug directory
        anywhere below <tessDir>/vs2008;
      * every file directly inside <tessDir>/vs2008 except tesseract.sln,
        tesshelper.py and readme.txt;
      * in all sub-directories, files ending in .suo, .ncb or .user and
        files whose extension ends with "~".

    Args:
        tessDir: tesseract root directory.
    '''
    try:
        ask = raw_input  # Python 2
    except NameError:
        ask = input  # Python 3

    buildDirNames = ("LIB_Release", "LIB_Debug", "DLL_Release", "DLL_Debug")
    keptRootFiles = ("tesseract.sln", "tesshelper.py", "readme.txt")
    tempExtensions = (".suo", ".ncb", ".user")

    vs2008AbsDir = os.path.abspath(os.path.join(tessDir, "vs2008"))

    answer = ask(
        'Are you sure you want to clean the\n "%s" folder (Yes/No) [No]? ' %
        vs2008AbsDir)
    if answer.strip().lower() != "yes":
        return
    answer = ask('Only list the items to be deleted (Yes/No) [Yes]? ')
    listOnly = answer.strip().lower() != "no"

    def discard(path, remover):
        """List path, or delete it with remover unless in list-only mode."""
        if listOnly:
            print("Would remove: %s" % path)
        else:
            print("Removing: %s" % path)
            remover(path)

    for rootDir, dirs, files in os.walk(vs2008AbsDir):
        for buildDir in buildDirNames:
            if buildDir in dirs:
                # Prune in place so os.walk does not descend into it.
                dirs.remove(buildDir)
                discard(os.path.join(rootDir, buildDir), shutil.rmtree)

        if rootDir == vs2008AbsDir:
            # Top level: everything except the explicitly kept files goes.
            for name in files:
                if name.lower() not in keptRootFiles:
                    discard(os.path.join(rootDir, name), os.remove)
        else:
            # Sub-directories: only known temp files go.
            for name in files:
                ext = os.path.splitext(name)[1]
                if ext.lower() in tempExtensions or ext.endswith("~"):
                    discard(os.path.join(rootDir, name), os.remove)
|
|
|
|
# ====================================================================
|
|
|
|
def validateTessDir(tessDir):
    """Check that tessDir is a valid tesseract directory.

    Used as an argparse ``type=`` callable.

    Args:
        tessDir: candidate tesseract root directory.

    Returns:
        tessDir unchanged when it is a directory containing the libtesseract
        project file (PROJ_SUBDIR/PROJFILE).

    Raises:
        argparse.ArgumentTypeError: tessDir is not a directory, or the
            project file is missing.
    """
    if not os.path.isdir(tessDir):
        raise argparse.ArgumentTypeError(
            'Directory "%s" doesn\'t exist.' % tessDir)

    projFile = os.path.join(tessDir, PROJ_SUBDIR, PROJFILE)
    if not os.path.isfile(projFile):
        raise argparse.ArgumentTypeError(
            'Project file "%s" doesn\'t exist.' % projFile)

    return tessDir
|
|
|
|
def validateDir(dir):
    """Check that dir is a valid directory named include.

    Used as an argparse ``type=`` callable.

    Args:
        dir: candidate include directory.

    Returns:
        dir unchanged when it exists and the last component of its absolute
        path is "include" (compared case-insensitively).

    Raises:
        argparse.ArgumentTypeError: dir is not a directory, or it is not
            named "include".
    """
    if not os.path.isdir(dir):
        raise argparse.ArgumentTypeError('Directory "%s" doesn\'t exist.' % dir)

    # Resolve first so inputs such as "." or "include/.." are judged by
    # their real final component.
    tail = os.path.split(os.path.abspath(dir))[1]
    if tail.lower() != "include":
        raise argparse.ArgumentTypeError(
            'Include directory "%s" must be named "include".' % tail)

    return dir
|
|
|
|
def main():
    """Parse the command line and run the selected command.

    Usage: tesshelper.py tessDir {compare,report,copy includeDir,clean}.
    When exactly one argument is given, "compare" is assumed.
    """
    parser = argparse.ArgumentParser(
        epilog=epilogStr,
        formatter_class=argparse.RawDescriptionHelpFormatter)

    parser.add_argument("--version", action="version",
                        version="%(prog)s " + VERSION)
    parser.add_argument('tessDir', type=validateTessDir,
                        help="tesseract installation directory")

    subparsers = parser.add_subparsers(
        dest="subparser_name",
        title="Commands")

    parser_changes = subparsers.add_parser(
        'compare',
        help="compare libtesseract Project with tessDir")
    parser_changes.set_defaults(func=tessCompare)

    parser_report = subparsers.add_parser(
        'report',
        help="report libtesseract summary stats")
    parser_report.set_defaults(func=tessReport)

    parser_copy = subparsers.add_parser(
        'copy',
        help="copy public libtesseract header files to includeDir")
    parser_copy.add_argument('includeDir', type=validateDir,
                             help="Directory to copy header files to.")
    parser_copy.set_defaults(func=tessCopy)

    parser_clean = subparsers.add_parser(
        'clean',
        help="clean vs2008 folder of build folders and .user files")
    parser_clean.set_defaults(func=tessClean)

    # Kludge: argparse cannot declare a default subparser, so when only one
    # argument was given, append "compare" before parsing.
    if len(sys.argv) == 2:
        sys.argv.append("compare")
    args = parser.parse_args()

    # "copy" is the only command taking a second argument.
    if args.func is tessCopy:
        args.func(args.tessDir, args.includeDir)
    else:
        args.func(args.tessDir)


if __name__ == '__main__':
    main()
|