"""Extract the "Item 1A. Risk Factors" section from cleaned 10-K reports.

For every report under <year>.clean/, locate the last occurrence of the
"item 1a" heading and the "item 1b"/"item 2" heading that follows it, and
write the text in between to a parallel <year>.risk/ directory.
"""

__author__ = 'Navid Rekabsaz'
__date__ = "11.06.2016"

import os
from os import listdir
from os.path import isfile, join
import traceback
import codecs
from multiprocessing import Pool

from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords


class KeyboardInterruptError(Exception):
    """Raised in workers so ^C propagates cleanly through Pool.map."""
    pass


def _mkdir_recursive(path):
    """Create `path` together with any missing parent directories."""
    sub_path = os.path.dirname(path)
    if not os.path.exists(sub_path):
        _mkdir_recursive(sub_path)
    if not os.path.exists(path):
        os.mkdir(path)


def isequal(tokens, effectivetokens1, effectivetokens2):
    """Heuristically decide whether `tokens` form a section heading.

    A line counts as a heading when at least 80% of its tokens are
    "effective" tokens and the effective tokens are themselves well
    covered (more than half of them, or at least 80% of them, occur).
    """
    found1 = 0
    for effectivetoken in effectivetokens1:
        if effectivetoken in tokens:
            found1 += 1
    found2 = 0
    for effectivetoken in effectivetokens2:
        if effectivetoken in tokens:
            found2 += 1
    if float(len(tokens))*0.8 <= float(found1+found2) \
            and ((float(found1+found2) > float(len(effectivetokens1)+len(effectivetokens2))*0.5)
                 or (float(len(effectivetokens1)+len(effectivetokens2))*0.8 <= float(found1+found2))):
        return True
    return False


def find_lineids(lines):
    """Return, per 10-K item, the indices of lines that look like that item's heading."""
    lineids = {'1a': [], '1b': [], '2': [], '7': [], '7a': [], '8': []}
    for line_i, line in enumerate(lines):
        # headings are short; skip long paragraph lines
        if len(line) > 200:
            continue
        line = line.strip('\n').lower()
        tokens = tokenizer.tokenize(line)
        tokens = [token for token in tokens
                  if token not in stops and (len(token) > 1 or token.isdigit())]
        # crude stemming: drop a trailing 's' ('factors' -> 'factor')
        for token_i, token in enumerate(tokens):
            if token[-1] == 's':
                tokens[token_i] = token[:-1]
        #print line_i, line

        # item 1a. risk factors
        effectivetokens1 = ['risk', 'factor']
        effectivetokens2 = ['item', '1a']
        if isequal(tokens, effectivetokens1, effectivetokens2):
            lineids['1a'].append(line_i)

        # item 1b. unresolved staff comments
        effectivetokens1 = ['unresolved', 'staff', 'comment']
        effectivetokens2 = ['item', '1b']
        if isequal(tokens, effectivetokens1, effectivetokens2):
            lineids['1b'].append(line_i)

        # item 2. properties
        effectivetokens1 = ['propertie']
        effectivetokens2 = ['item', '2']
        if isequal(tokens, effectivetokens1, effectivetokens2):
            lineids['2'].append(line_i)

        # item 7
        #effectivetokens1 = ['management', 'discussion', 'analysi']
        #effectivetokens2 = ['item', '7', 'financial', 'condition', 'result', 'operation', 'operating']
        #if isequal(tokens, effectivetokens1, effectivetokens2):
        #    lineids['7'].append(line_i)

        # item 7a
        #effectivetokens1 = ['quantitative', 'qualitative', 'disclosure', 'market', 'risk']
        #effectivetokens2 = ['item', '7a', 'qualification']
        #if isequal(tokens, effectivetokens1, effectivetokens2):
        #    lineids['7a'].append(line_i)

        # item 8
        #effectivetokens1 = ['financial', 'statement', 'supplementary', 'data']
        #effectivetokens2 = ['item', '8', 'consolidated']
        #if isequal(tokens, effectivetokens1, effectivetokens2):
        #    lineids['8'].append(line_i)
    return lineids


def goextract(year):
    """Extract the risk-factors section of every report of one year."""
    try:
        mypath = '/data/nrekabsaz/data/financialrisk/10k/reports/'+str(year)+'.clean/'
        allfiles = [mypath+f for f in listdir(mypath) if isfile(join(mypath, f))]
        for filepath_i, filepath in enumerate(allfiles):
            with codecs.open(filepath, 'r', encoding='utf8') as f:
                lines = f.readlines()
            lineids = find_lineids(lines)
            lineids1a = lineids['1a']
            lineids1b = lineids['1b']
            # an "item 2" heading also closes the risk-factors section
            lineids1b.extend(lineids['2'])
            if len(lineids['1a']) != 0 and len(lineids['1b']) != 0:
                # headings occur in the table of contents as well as in the
                # body, so walk the candidates from the end of the document
                lineids1a.sort(reverse=True)
                lineids1b.sort(reverse=True)
                candidate1a = 0
                candidate1b = 0
                for l1a in lineids1a:
                    if candidate1a != 0 and candidate1b != 0:
                        break
                    for l1b in lineids1b:
                        if l1a < l1b:
                            # closest end-heading after this start so far
                            candidate1b = l1b
                        else:
                            candidate1a = l1a
                            break
                if candidate1a != 0 and candidate1b != 0:
                    text = ''.join(lines[candidate1a:candidate1b-1])
                    if len(text) > 250:
                        targetpath = filepath.replace('.clean', '.risk')
                        _mkdir_recursive(os.path.dirname(targetpath))
                        with codecs.open(targetpath, 'w', encoding='utf8') as f:
                            f.write(text)

            #if len(lineids['7'])!=0 and len(lineids['8'])!=0:
            #    lineids7=lineids['7']
            #    lineids8=lineids['8']
            #    lineids8.extend(lineids['7a'])
            #    lineids8.sort(reverse=True)
            #    lineids7.sort(reverse=True)
            #    candidate7=0
            #    candidate8=0
            #    for l7 in lineids7:
            #        if candidate8!=0 and candidate7!=0:
            #            break
            #        for l8 in lineids8:
            #            if l7>l8:
            #                candidate7=l7
            #                break
            #            elif candidate8==0:
            #                candidate8=l8
            #    if candidate7!=0 and candidate8!=0:
            #        targetpath=filepath.replace('.clean','.mda')
            #        _mkdir_recursive(os.path.dirname(targetpath))
            #        text=''.join(lines[candidate7:candidate8-1])
            #        fw=open(targetpath,'w')
            #        fw.write(text)
            #        fw.close()

            if filepath_i % 10 == 0:
                print year, filepath_i, '/', len(allfiles)
            #break
        #break
    except KeyboardInterrupt:
        # re-raise as a normal exception so the parent pool handles ^C
        raise KeyboardInterruptError()
    except Exception:
        msgText = traceback.format_exc()
        print msgText


if __name__ == "__main__":
    # module-level state inherited by the forked pool workers
    tokenizer = RegexpTokenizer(r'\w+')
    stops = set(stopwords.words('english'))
    years = range(1996, 2008)
    p = Pool(10)
    try:
        print 'starting the pool map'
        p.map(goextract, years)
        #goextract(1996)
        p.close()
        print 'pool map complete'
    except KeyboardInterrupt:
        print 'got ^C while pool mapping, terminating the pool'
        p.terminate()
        print 'pool is terminated'
    except Exception as e:
        print 'got exception: %r, terminating the pool' % (e,)
        p.terminate()
        print 'pool is terminated'
    finally:
        print 'joining pool processes'
        p.join()
        print 'join complete'