#!/usr/bin/env python
from __future__ import print_function
from builtins import input
"""
tesshelper.py -- Utility operations to compare, report stats, and copy 
                 public headers for tesseract 3.0x VS2010 Project

$RCSfile: tesshelper.py,v $ $Revision: 7ca575b377aa $ $Date: 2012/03/07 17:26:31 $
"""

r"""
Requires:

 python 2.7 or greater: activestate.com
  http://www.activestate.com/activepython/downloads

because using the new argparse module and new literal set syntax (s={1, 2}) .

General Notes:
--------------

Format for a .vcproj file entry:

			<File
				RelativePath="..\src\allheaders.h"
				>
			</File>

"""

epilogStr = r"""
Examples:

Assume that tesshelper.py is in c:\buildfolder\tesseract-3.02\vs2010,
which is also the current directory. Then,

    python tesshelper .. compare
    
will compare c:\buildfolder\tesseract-3.02 "library" directories to the
libtesseract Project 
(c:\buildfolder\tesseract-3.02\vs2010\libtesseract\libtesseract.vcproj).

    python tesshelper ..  report
    
will display summary stats for c:\buildfolder\tesseract-3.02 "library" 
directories and the libtesseract Project.

    python tesshelper .. copy ..\..\include

will copy all "public" libtesseract header files to
c:\buildfolder\include.

    python tesshelper .. clean

will clean the vs2010 folder of all build directories, and .user, .suo,
.ncb, and other temp files.

"""

# imports of python standard library modules
# See Python Documentation | Library Reference for details
import collections
import glob
import argparse
import os
import re
import shutil
import sys

# ====================================================================

VERSION = "1.0 %s" % "$Date: 2012/03/07 17:26:31 $".split()[1]
PROJ_SUBDIR = r"vs2010\libtesseract"
PROJFILE = "libtesseract.vcproj"

NEWHEADERS_FILENAME = "newheaders.txt"
NEWSOURCES_FILENAME = "newsources.txt"

fileNodeTemplate = \
'''    <ClCompile Include="..\..\%s" />'''

# ====================================================================

def getProjectfiles(libTessDir, libProjectFile, nTrimChars):
    """Return sets of all, c, h, and resources files in libtesseract Project"""

    #extract filenames of header & source files from the .vcproj
    projectCFiles = set()
    projectHFiles = set()
    projectRFiles = set()
    projectFilesSet = set()
    f = open(libProjectFile, "r")
    data = f.read()
    f.close()

    projectFiles = re.findall(r'(?i)Include="(\.[^"]+)"', data)
    for projectFile in projectFiles:
        root, ext = os.path.splitext(projectFile.lower())
        if ext == ".c" or ext == ".cpp":
            projectCFiles.add(projectFile)
        elif ext == ".h":
            projectHFiles.add(projectFile)
        elif ext == ".rc":
            projectRFiles.add(projectFile)
        else:
            print("unknown file type: %s" % projectFile)
            
        relativePath = os.path.join(libTessDir, projectFile)
        relativePath = os.path.abspath(relativePath)
        relativePath = relativePath[nTrimChars:].lower()
        projectFilesSet.add(relativePath)
    
    return projectFilesSet, projectHFiles, projectCFiles, projectRFiles
    
def getTessLibFiles(tessDir, nTrimChars):
    """Return set of all libtesseract files in tessDir"""

    libDirs = [
        "api",
        "ccmain",
        "ccstruct",
        "ccutil",
        "classify",
        "cube",
        "cutil",
        "dict",
        r"neural_networks\runtime",
        "opencl",
        "textord",
        "viewer",
        "wordrec",
        #"training",
        r"vs2010\port",
        r"vs2010\libtesseract",
        ]

    #create list of all .h, .c, .cpp files in "library" directories
    tessFiles = set()
    for curDir in libDirs:
        baseDir = os.path.join(tessDir, curDir)
        for filetype in ["*.c", "*.cpp", "*.h"]:
            pattern = os.path.join(baseDir, filetype)
            fileList = glob.glob(pattern)
            for curFile in fileList:
                curFile = os.path.abspath(curFile)
                relativePath = curFile[nTrimChars:].lower()
                tessFiles.add(relativePath)

    return tessFiles

# ====================================================================

def tessCompare(tessDir):
    '''Compare libtesseract Project files and actual "sub-library" files.'''

    vs2010Dir = os.path.join(tessDir, "vs2010")
    libTessDir = os.path.join(vs2010Dir, "libtesseract")
    libProjectFile = os.path.join(libTessDir,"libtesseract.vcxproj")
    tessAbsDir = os.path.abspath(tessDir)
    nTrimChars = len(tessAbsDir)+1
    print('Comparing VS2010 Project "%s" with\n   "%s"' % (libProjectFile,
                                                           tessAbsDir))
    
    projectFilesSet, projectHFiles, projectCFiles, projectRFiles = \
        getProjectfiles(libTessDir, libProjectFile, nTrimChars)
    tessFiles = getTessLibFiles(tessDir, nTrimChars)
 
    extraFiles = tessFiles - projectFilesSet
    print("%2d Extra files (in %s but not in Project)" % (len(extraFiles),
                                                          tessAbsDir))
    headerFiles = []
    sourceFiles = []
    sortedList = list(extraFiles)
    sortedList.sort()
    for filename in sortedList:
        root, ext = os.path.splitext(filename.lower())
        if ext == ".h":
            headerFiles.append(filename)
        else:
            sourceFiles.append(filename)
        print("   %s " % filename)

    print()
    print("%2d new header file items written to %s" % (len(headerFiles),
                                                    NEWHEADERS_FILENAME))
    headerFiles.sort()
    with open(NEWHEADERS_FILENAME, "w") as f:
        for filename in headerFiles:
            f.write(fileNodeTemplate % filename)

    print("%2d new source file items written to %s" % (len(sourceFiles),
                                                    NEWSOURCES_FILENAME))
    sourceFiles.sort()
    with open(NEWSOURCES_FILENAME, "w") as f:
        for filename in sourceFiles:
            f.write(fileNodeTemplate % filename)
    print()

    deadFiles = projectFilesSet - tessFiles
    print("%2d Dead files (in Project but not in %s" % (len(deadFiles),
                                                        tessAbsDir))
    sortedList = list(deadFiles)
    sortedList.sort()
    for filename in sortedList:
        print("   %s " % filename)
        
# ====================================================================

def tessReport(tessDir):
    """Report summary stats on "sub-library" files and libtesseract Project file."""

    vs2010Dir = os.path.join(tessDir, "vs2010")
    libTessDir = os.path.join(vs2010Dir, "libtesseract")
    libProjectFile = os.path.join(libTessDir,"libtesseract.vcproj")
    tessAbsDir = os.path.abspath(tessDir)
    nTrimChars = len(tessAbsDir)+1
    
    projectFilesSet, projectHFiles, projectCFiles, projectRFiles = \
        getProjectfiles(libTessDir, libProjectFile, nTrimChars)
    tessFiles = getTessLibFiles(tessDir, nTrimChars)

    print('Summary stats for "%s" library directories' % tessAbsDir)
    folderCounters = {}
    for tessFile in tessFiles:
        tessFile = tessFile.lower()
        folder, head = os.path.split(tessFile)
        file, ext = os.path.splitext(head)
        typeCounter = folderCounters.setdefault(folder, collections.Counter())
        typeCounter[ext[1:]] += 1

    folders = list(folderCounters.keys())
    folders.sort()
    totalFiles = 0
    totalH = 0
    totalCPP = 0
    totalOther = 0

    print()
    print(" total  h  cpp")
    print(" ----- --- ---")
    for folder in folders:
        counters = folderCounters[folder]
        nHFiles = counters['h']
        nCPPFiles = counters['cpp']
        
        total = nHFiles + nCPPFiles
        totalFiles += total
        totalH += nHFiles
        totalCPP += nCPPFiles
        
        print(" %5d %3d %3d %s" % (total, nHFiles, nCPPFiles, folder))
    print(" ----- --- ---")
    print(" %5d %3d %3d" % (totalFiles, totalH, totalCPP))
    
    print()
    print('Summary stats for VS2010 Project "%s"' % libProjectFile)
    print(" %5d %s" %(len(projectHFiles), "Header files"))
    print(" %5d %s" % (len(projectCFiles), "Source files"))
    print(" %5d %s" % (len(projectRFiles), "Resource files"))
    print(" -----")
    print(" %5d" % (len(projectHFiles) + len(projectCFiles) + len(projectRFiles), ))
        
# ====================================================================

def copyIncludes(fileSet, description, tessDir, includeDir):
    """Copy set of files to specified include dir."""
    
    print()
    print('Copying libtesseract "%s" headers to %s' % (description, includeDir))
    print()

    sortedList = list(fileSet)
    sortedList.sort()
    
    count = 0
    errList = []
    for includeFile in sortedList:
        filepath = os.path.join(tessDir, includeFile)
        if os.path.isfile(filepath):
            shutil.copy2(filepath, includeDir)
            print("Copied: %s" % includeFile)
            count += 1
        else:
            print('***Error: "%s" doesn\'t exist"' % filepath)
            errList.append(filepath)
            
    print('%d header files successfully copied to "%s"' % (count, includeDir))
    if len(errList):
        print("The following %d files were not copied:")
        for filepath in errList:
            print("   %s" % filepath)
    
def tessCopy(tessDir, includeDir):
    '''Copy all "public" libtesseract Project header files to include directory.
    
    Preserves directory hierarchy.'''

    baseIncludeSet = {
        r"api\baseapi.h",
        r"api\capi.h",
        r"api\apitypes.h",
        r"ccstruct\publictypes.h",
        r"ccmain\thresholder.h",
        r"ccutil\host.h",
        r"ccutil\basedir.h",
        r"ccutil\tesscallback.h",
        r"ccutil\unichar.h",
        r"ccutil\platform.h",
        }

    strngIncludeSet = {
        r"ccutil\strngs.h",
        r"ccutil\memry.h",
        r"ccutil\host.h",
        r"ccutil\serialis.h",
        r"ccutil\errcode.h",
        r"ccutil\fileerr.h",
        #r"ccutil\genericvector.h",
        }

    resultIteratorIncludeSet = {
        r"ccmain\ltrresultiterator.h",
        r"ccmain\pageiterator.h",
        r"ccmain\resultiterator.h",
        r"ccutil\genericvector.h",
        r"ccutil\tesscallback.h",
        r"ccutil\errcode.h",
        r"ccutil\host.h",
        r"ccutil\helpers.h",
        r"ccutil\ndminx.h",
        r"ccutil\params.h",
        r"ccutil\unicharmap.h",
        r"ccutil\unicharset.h",
        }

    genericVectorIncludeSet = {
        r"ccutil\genericvector.h",
        r"ccutil\tesscallback.h",
        r"ccutil\errcode.h",
        r"ccutil\host.h",
        r"ccutil\helpers.h",
        r"ccutil\ndminx.h",
        }
    
    blobsIncludeSet = {
        r"ccstruct\blobs.h",
        r"ccstruct\rect.h",
        r"ccstruct\points.h",
        r"ccstruct\ipoints.h",
        r"ccutil\elst.h",
        r"ccutil\host.h",
        r"ccutil\serialis.h",
        r"ccutil\lsterr.h",
        r"ccutil\ndminx.h",
        r"ccutil\tprintf.h",
        r"ccutil\params.h",
        r"viewer\scrollview.h",
        r"ccstruct\vecfuncs.h",
        }

    extraFilesSet = {
        #r"vs2010\include\stdint.h",
        r"vs2010\include\leptonica_versionnumbers.vsprops",
        r"vs2010\include\tesseract_versionnumbers.vsprops",
        }
        
    tessIncludeDir = os.path.join(includeDir, "tesseract")
    if os.path.isfile(tessIncludeDir):
        print('Aborting: "%s" is a file not a directory.' % tessIncludeDir)
        return
    if not os.path.exists(tessIncludeDir):
        os.mkdir(tessIncludeDir)
    
    #fileSet = baseIncludeSet | strngIncludeSet | genericVectorIncludeSet | blobsIncludeSet
    fileSet = baseIncludeSet | strngIncludeSet | resultIteratorIncludeSet

    copyIncludes(fileSet, "public", tessDir, tessIncludeDir)
    copyIncludes(extraFilesSet, "extra", tessDir, includeDir)

# ====================================================================

def tessClean(tessDir):
    '''Clean vs2010 folder of all build directories and certain temp files.'''

    vs2010Dir = os.path.join(tessDir, "vs2010")
    vs2010AbsDir = os.path.abspath(vs2010Dir)

    answer = eval(input(
        'Are you sure you want to clean the\n "%s" folder (Yes/No) [No]? ' % 
        vs2010AbsDir))
    if answer.lower() not in ("yes",):
        return
    answer = eval(input('Only list the items to be deleted (Yes/No) [Yes]? '))
    answer = answer.strip()
    listOnly = answer.lower() not in ("no",)

    for rootDir, dirs, files in os.walk(vs2010AbsDir):
        for buildDir in ("LIB_Release", "LIB_Debug", "DLL_Release", "DLL_Debug"):
            if buildDir in dirs:
                dirs.remove(buildDir)
                absBuildDir = os.path.join(rootDir, buildDir)
                if listOnly:
                    print("Would remove: %s" % absBuildDir)
                else:
                    print("Removing: %s" % absBuildDir)
                    shutil.rmtree(absBuildDir)

        if rootDir == vs2010AbsDir:
            for file in files:
                if file.lower() not in ("tesseract.sln",
                                        "tesshelper.py",
                                        "readme.txt"):
                    absPath = os.path.join(rootDir, file)
                    if listOnly:
                        print("Would remove: %s" % absPath)
                    else:
                        print("Removing: %s" % absPath)
                        os.remove(absPath)
        else:
            for file in files:
                root, ext = os.path.splitext(file)
                if ext.lower() in (".suo",
                                   ".ncb",
                                   ".user",
                                   ) or (
                    len(ext)>0 and ext[-1] == "~"):
                    absPath = os.path.join(rootDir, file)
                    if listOnly:
                        print("Would remove: %s" % absPath)
                    else:
                        print("Removing: %s" % absPath)
                        os.remove(absPath)

# ====================================================================

def validateTessDir(tessDir):
    """Check that tessDir is a valid tesseract directory."""
    
    if not os.path.isdir(tessDir):
        raise argparse.ArgumentTypeError('Directory "%s" doesn\'t exist.' % tessDir)
    projFile = os.path.join(tessDir, PROJ_SUBDIR, PROJFILE)
    if not os.path.isfile(projFile):
        raise argparse.ArgumentTypeError('Project file "%s" doesn\'t exist.' % projFile)
    return tessDir

def validateDir(dir):
    """Check that dir is a valid directory named include."""
    
    if not os.path.isdir(dir):
        raise argparse.ArgumentTypeError('Directory "%s" doesn\'t exist.' % dir)
    
    dirpath = os.path.abspath(dir)
    head, tail = os.path.split(dirpath)
    if tail.lower() != "include":
        raise argparse.ArgumentTypeError('Include directory "%s" must be named "include".' % tail)
        
    return dir

def main ():
    parser = argparse.ArgumentParser(
        epilog=epilogStr,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    
    parser.add_argument("--version", action="version",
                        version="%(prog)s " + VERSION)
    parser.add_argument('tessDir', type=validateTessDir,
                        help="tesseract installation directory")

    subparsers = parser.add_subparsers(
        dest="subparser_name",
        title="Commands")
    parser_changes = subparsers.add_parser('compare',
                                           help="compare libtesseract Project with tessDir")
    parser_changes.set_defaults(func=tessCompare)
    
    parser_report = subparsers.add_parser('report',
                                          help="report libtesseract summary stats")
    parser_report.set_defaults(func=tessReport)

    parser_copy = subparsers.add_parser('copy',
                                        help="copy public libtesseract header files to includeDir")
    parser_copy.add_argument('includeDir', type=validateDir,
                             help="Directory to copy header files to.")
    parser_copy.set_defaults(func=tessCopy)

    parser_clean = subparsers.add_parser('clean',
        help="clean vs2010 folder of build folders and .user files")
    parser_clean.set_defaults(func=tessClean)

    #kludge because argparse has no ability to set default subparser
    if (len(sys.argv) == 2):
        sys.argv.append("compare")
    args = parser.parse_args()

    #handle commands
    if args.func == tessCopy:
        args.func(args.tessDir, args.includeDir)
    else:
        args.func(args.tessDir)

if __name__ == '__main__' :
    main()