###########################################################################
# TextIndexNG V 3                
# The next generation TextIndex for Zope
#
# This software is governed by a license. See
# LICENSE.txt for the terms of this license.
###########################################################################

"""
Interactive indexer shell

$Id: shell.py 1561 2006-06-03 10:51:57Z ajung $
"""

import sys, os, time, atexit
import hotshot, hotshot.stats
from optparse import OptionParser

from zope.app.tests import placelesssetup, ztapi
from zope.component.interfaces import IFactory

from index import Index

from stopwords import Stopwords
from splitter import SplitterFactory
from parsers.english import EnglishParser
from textindexng.lexicon import LexiconFactory
from textindexng.storage import StorageWithTermFrequencyFactory
from textindexng.thesaurus import GermanThesaurus
from textindexng.interfaces import IParser, IStopwords, IThesaurus


# Setup environment
#
# Register every component the index looks up by name, so that the shell can
# run stand-alone (outside of a Zope instance).
# NOTE(review): placelesssetup.setUp() comes from the zope.app test helpers
# and presumably initializes an empty component registry -- confirm.
placelesssetup.setUp()
# Splitter factory, registered under the name 'txng.splitters.default'
ztapi.provideUtility(IFactory, SplitterFactory, 'txng.splitters.default')
# Query parser instance for English
ztapi.provideUtility(IParser, EnglishParser() , 'txng.parsers.en')
# Stopword list instance
ztapi.provideUtility(IStopwords, Stopwords(), 'txng.stopwords')
# Lexicon and storage factories
ztapi.provideUtility(IFactory, LexiconFactory, 'txng.lexicons.default')
ztapi.provideUtility(IFactory, StorageWithTermFrequencyFactory, 'txng.storages.default')
# NOTE(review): GermanThesaurus is registered as imported (not called), unlike
# the parser and stopwords above -- confirm it is already an instance.
ztapi.provideUtility(IThesaurus, GermanThesaurus, 'txng.thesaurus.de')


# Optional command line history for the query prompt.
#
# readline is not available on every platform, so a failing import simply
# disables the feature. A missing or unreadable history file must not disable
# it: the save hook is registered regardless, so that the file gets created
# when the shell exits for the first time.
try:
    import readline
except ImportError:
    readline = None

if readline is not None:
    histfile = os.path.expanduser('~/.pyhist')
    try:
        readline.read_history_file(histfile)
    except IOError:
        # No history yet (first run) or file not readable: start empty
        pass
    atexit.register(readline.write_history_file, histfile)


class Text:
    """Minimal wrapper that turns a plain string into an indexable object.

    The wrapped text is exposed through the ``SearchableText`` attribute,
    which is the field name the index in this script is created with.
    """

    def __init__(self, s):
        # Keep the text under the attribute name used as the index field
        self.SearchableText = s


# Command line interface of the shell. Remaining positional arguments are
# collected in ``files``.
parser = OptionParser()

# -d/--directory: where to look for input files
parser.add_option(
    '-d', '--directory',
    dest='directory',
    action='store',
    type='string',
    default='tests/data/texts',
    help='directory to be search for input files'
)

# -p/--profile: switch profiling on
parser.add_option(
    '-p', '--profile',
    dest='profile',
    action='store_true',
    default=False,
    help='perform profiling of the indexing process'
)

# -t/--thesaurus: ID of the thesaurus passed on to the search call
parser.add_option(
    '-t', '--thesaurus',
    dest='thesaurus',
    action='store',
    default=None,
    help='ID of thesaurus to be used'
)

(options, files) = parser.parse_args()

# The index all documents are added to and all queries are run against
I = Index(fields=('SearchableText',), autoexpand_limit=4)

# Start of the indexing run (wall clock), used for the summary line
ts = time.time()

# Running totals maintained by do_index(): number of indexed documents and
# accumulated size of their raw text
count = bytes = 0

# Maps a document id to the name of the file it was read from
ID2FILES = dict()

def do_index(options, files):
    global count, bytes

    if not files:
        print >>sys.stderr, 'Reading files from %s' % options.directory
        files = [] 
        for dirname, dirs, filenames in os.walk(options.directory):
            for f in filenames:
                fullname = os.path.join(dirname, f)
                if f.endswith('txt'):
                    files.append(fullname)

    for docid, fname in enumerate(files):

        text = open(fname).read()
        I.index_object(Text(unicode(text, 'iso-8859-15')), docid)    
        count += 1
        bytes += len(text)
        ID2FILES[docid] = fname
        if count % 100 ==0:
            print count



if options.profile:
    prof = hotshot.Profile('indexer.prof')
    prof.runcall(do_index, options, files)

    stats = hotshot.stats.load('indexer.prof')
    stats.strip_dirs()
    stats.sort_stats('cumulative', 'calls')
    stats.print_stats(25)
else:
    do_index(options, files)

duration = time.time() - ts
print '%d documents, duration: %5.3f seconds,total size: %d bytes, speed: %5.3f bytes/second' % (count, duration, bytes, float(bytes)/duration)
    
while 1:
    query = raw_input('query> ')
    query = unicode(query, 'iso-8859-15')
    try:
        kw = {'autoexpand' : 'off',
              'ranking' : True,
              'ranking_maxhits' : 100,
              'field' : 'SearchableText',
             }
        if options.thesaurus:
            kw['thesaurus'] = options.thesaurus

        ts = time.time()

        if options.profile:
            prof = hotshot.Profile('query.prof')
            result = prof.runcall(I.search, query, **kw)
            stats = hotshot.stats.load('query.prof')
            stats.strip_dirs()
            stats.sort_stats('cumulative', 'calls')
            stats.print_stats(25)
        else:
            result = I.search(query, **kw)
        te = time.time()
        for docid,score in result.getRankedResults().items():
            print ID2FILES[docid], score
        print '%2.5lf milli-seconds' % (1000.0*(te-ts))
    except:
        import traceback
        traceback.print_exc()
