"""Extract the "Item 1A. Risk Factors" section from cleaned 10-K reports.

For every report under <year>.clean/, locate the last occurrence of the
"item 1a" heading and the "item 1b"/"item 2" heading that follows it, and
write the text in between to a parallel <year>.risk/ directory.
"""

__author__ = 'Navid Rekabsaz'
__date__ = "11.06.2016"

import os
from os import listdir
from os.path import isfile, join
import traceback
import codecs
from multiprocessing import Pool

from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords


class KeyboardInterruptError(Exception):
    """Raised in workers so ^C propagates cleanly through Pool.map."""
    pass


def _mkdir_recursive(path):
    """Create `path` together with any missing parent directories."""
    sub_path = os.path.dirname(path)
    if not os.path.exists(sub_path):
        _mkdir_recursive(sub_path)
    if not os.path.exists(path):
        os.mkdir(path)


def isequal(tokens, effectivetokens1, effectivetokens2):
    """Heuristically decide whether `tokens` form a section heading.

    A line counts as a heading when at least 80% of its tokens are
    "effective" tokens and the effective tokens are themselves well
    covered (more than half of them, or at least 80% of them, occur).
    """
    found1 = 0
    for effectivetoken in effectivetokens1:
        if effectivetoken in tokens:
            found1 += 1
    found2 = 0
    for effectivetoken in effectivetokens2:
        if effectivetoken in tokens:
            found2 += 1
    if float(len(tokens))*0.8 <= float(found1+found2) \
            and ((float(found1+found2) > float(len(effectivetokens1)+len(effectivetokens2))*0.5)
                 or (float(len(effectivetokens1)+len(effectivetokens2))*0.8 <= float(found1+found2))):
        return True
    return False


def find_lineids(lines):
    """Return, per 10-K item, the indices of lines that look like that item's heading."""
    lineids = {'1a': [], '1b': [], '2': [], '7': [], '7a': [], '8': []}
    for line_i, line in enumerate(lines):
        # headings are short; skip long paragraph lines
        if len(line) > 200:
            continue
        line = line.strip('\n').lower()
        tokens = tokenizer.tokenize(line)
        tokens = [token for token in tokens
                  if token not in stops and (len(token) > 1 or token.isdigit())]
        # crude stemming: drop a trailing 's' ('factors' -> 'factor')
        for token_i, token in enumerate(tokens):
            if token[-1] == 's':
                tokens[token_i] = token[:-1]
        #print line_i, line

        # item 1a. risk factors
        effectivetokens1 = ['risk', 'factor']
        effectivetokens2 = ['item', '1a']
        if isequal(tokens, effectivetokens1, effectivetokens2):
            lineids['1a'].append(line_i)

        # item 1b. unresolved staff comments
        effectivetokens1 = ['unresolved', 'staff', 'comment']
        effectivetokens2 = ['item', '1b']
        if isequal(tokens, effectivetokens1, effectivetokens2):
            lineids['1b'].append(line_i)

        # item 2. properties
        effectivetokens1 = ['propertie']
        effectivetokens2 = ['item', '2']
        if isequal(tokens, effectivetokens1, effectivetokens2):
            lineids['2'].append(line_i)

        # item 7
        #effectivetokens1 = ['management', 'discussion', 'analysi']
        #effectivetokens2 = ['item', '7', 'financial', 'condition', 'result', 'operation', 'operating']
        #if isequal(tokens, effectivetokens1, effectivetokens2):
        #    lineids['7'].append(line_i)

        # item 7a
        #effectivetokens1 = ['quantitative', 'qualitative', 'disclosure', 'market', 'risk']
        #effectivetokens2 = ['item', '7a', 'qualification']
        #if isequal(tokens, effectivetokens1, effectivetokens2):
        #    lineids['7a'].append(line_i)

        # item 8
        #effectivetokens1 = ['financial', 'statement', 'supplementary', 'data']
        #effectivetokens2 = ['item', '8', 'consolidated']
        #if isequal(tokens, effectivetokens1, effectivetokens2):
        #    lineids['8'].append(line_i)
    return lineids


def goextract(year):
    """Extract the risk-factors section of every report of one year."""
    try:
        mypath = '/data/nrekabsaz/data/financialrisk/10k/reports/'+str(year)+'.clean/'
        allfiles = [mypath+f for f in listdir(mypath) if isfile(join(mypath, f))]
        for filepath_i, filepath in enumerate(allfiles):
            with codecs.open(filepath, 'r', encoding='utf8') as f:
                lines = f.readlines()
            lineids = find_lineids(lines)
            lineids1a = lineids['1a']
            lineids1b = lineids['1b']
            # an "item 2" heading also closes the risk-factors section
            lineids1b.extend(lineids['2'])
            if len(lineids['1a']) != 0 and len(lineids['1b']) != 0:
                # headings occur in the table of contents as well as in the
                # body, so walk the candidates from the end of the document
                lineids1a.sort(reverse=True)
                lineids1b.sort(reverse=True)
                candidate1a = 0
                candidate1b = 0
                for l1a in lineids1a:
                    if candidate1a != 0 and candidate1b != 0:
                        break
                    for l1b in lineids1b:
                        if l1a < l1b:
                            # closest end-heading after this start so far
                            candidate1b = l1b
                        else:
                            candidate1a = l1a
                            break
                if candidate1a != 0 and candidate1b != 0:
                    text = ''.join(lines[candidate1a:candidate1b-1])
                    if len(text) > 250:
                        targetpath = filepath.replace('.clean', '.risk')
                        _mkdir_recursive(os.path.dirname(targetpath))
                        with codecs.open(targetpath, 'w', encoding='utf8') as f:
                            f.write(text)

            #if len(lineids['7'])!=0 and len(lineids['8'])!=0:
            #    lineids7=lineids['7']
            #    lineids8=lineids['8']
            #    lineids8.extend(lineids['7a'])
            #    lineids8.sort(reverse=True)
            #    lineids7.sort(reverse=True)
            #    candidate7=0
            #    candidate8=0
            #    for l7 in lineids7:
            #        if candidate8!=0 and candidate7!=0:
            #            break
            #        for l8 in lineids8:
            #            if l7>l8:
            #                candidate7=l7
            #                break
            #            elif candidate8==0:
            #                candidate8=l8
            #    if candidate7!=0 and candidate8!=0:
            #        targetpath=filepath.replace('.clean','.mda')
            #        _mkdir_recursive(os.path.dirname(targetpath))
            #        text=''.join(lines[candidate7:candidate8-1])
            #        fw=open(targetpath,'w')
            #        fw.write(text)
            #        fw.close()

            if filepath_i % 10 == 0:
                print year, filepath_i, '/', len(allfiles)
            #break
        #break
    except KeyboardInterrupt:
        # re-raise as a normal exception so the parent pool handles ^C
        raise KeyboardInterruptError()
    except Exception:
        msgText = traceback.format_exc()
        print msgText


if __name__ == "__main__":
    # module-level state inherited by the forked pool workers
    tokenizer = RegexpTokenizer(r'\w+')
    stops = set(stopwords.words('english'))
    years = range(1996, 2008)
    p = Pool(10)
    try:
        print 'starting the pool map'
        p.map(goextract, years)
        #goextract(1996)
        p.close()
        print 'pool map complete'
    except KeyboardInterrupt:
        print 'got ^C while pool mapping, terminating the pool'
        p.terminate()
        print 'pool is terminated'
    except Exception as e:
        print 'got exception: %r, terminating the pool' % (e,)
        p.terminate()
        print 'pool is terminated'
    finally:
        print 'joining pool processes'
        p.join()
        print 'join complete'