@wolframalpha
Last active March 24, 2017 07:00
Please extract the NP using this!
# coding: utf-8
import pickle
import re
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
import numpy as np
import pandas as pd
from nltk.tag import pos_tag, pos_tag_sents
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import SGDClassifier
from scipy import sparse
from scipy.sparse import hstack

class ExperienceTagger(object):
    def __init__(self, filepath=None, modelpath=None):
        if filepath:
            df = pd.read_csv(filepath)
            df = df.ix[:, [0, 1]]
            df = df.loc[~pd.isnull(df.WORD)]
            df = df.loc[~pd.isnull(df.LABEL)]
            self.tfid_word = TfidfVectorizer(ngram_range=(1, 2), token_pattern='\\b[a-zA-Z]\. [a-zA-Z]+|\\b[a-zA-Z\.]+\\b')
            self.tfid_pos = TfidfVectorizer(ngram_range=(1, 3))
            X = self.transform_text_into_feature_vector(texts=df.WORD)
            y = df.LABEL
            self.clf = SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, n_iter=5, random_state=42)
            self.clf.fit(X, y)
            with open('experiencetaggermodel.bin', 'wb') as f:
                pickle.dump([self.clf, self.tfid_pos, self.tfid_word], f)
        elif modelpath:
            with open(modelpath, 'rb') as f:
                self.clf, self.tfid_pos, self.tfid_word = pickle.load(f)
    def get_features(self, element, return_dictionary=False):
        threshold_short = 3
        threshold_long = 5
        # print element
        element = element.decode('utf-8')
        feature_labels = ['size.short', 'size.medium', 'size.long', 'number.of.digits',
                          'size.words', 'no.of.NN', 'no.of.NNP', 'no.of.NNS']
        features = {label: 0 for label in feature_labels}
        # if element.find('%') > -1:
        #     features.update({'contains.percentage.symbol': 1})
        splitted_words = re.split('[^\w\.%]+', element)
        if len(splitted_words) < threshold_short:
            features.update({'size.short': 1})
        elif len(splitted_words) > threshold_long:
            features.update({'size.long': 1})
        else:
            features.update({'size.medium': 1})
        # if re.search('\d+\.\d+', element):
        #     features.update({'number.is.decimal': 1})
        features.update({'number.of.digits': len(re.findall('\d', element))})
        # features.update({'contains.hyphens': element.count('-')})
        # if re.search('\\|\/', element):
        #     features.update({'contains.slash': 1})
        nn = [y for _, y in pos_tag(word_tokenize(element))]
        features.update({'no.of.NN': nn.count('NN')})
        features.update({'no.of.NNP': nn.count('NNP')})
        features.update({'no.of.NNS': nn.count('NNS')})
        # features.update({'size.words': len(re.split('\W+', element))})
        if return_dictionary:
            return features
        return np.array([[x[1] for x in sorted(features.items(), key=lambda x: x[0])]])
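
    # Illustrative sketch of what get_features above produces (the example string is hypothetical):
    # get_features('3 years at Acme Corp', return_dictionary=True) would mark the phrase as
    # 'size.medium' (five tokens), count one digit, and add NN/NNP/NNS counts from the NLTK POS
    # tagger; the default array form orders these values alphabetically by feature name.
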
    def transform_text_into_feature_vector(self, texts, transform=True):
        pos_tags = map(lambda x: ' '.join(map(lambda x: x[1], x)), pos_tag_sents(map(word_tokenize, texts)))
        if transform:
            X_tfid_features_pos = self.tfid_pos.fit_transform(pos_tags)
            X_tfid_features_word = self.tfid_word.fit_transform(texts)
        else:
            X_tfid_features_pos = self.tfid_pos.transform(pos_tags)
            X_tfid_features_word = self.tfid_word.transform(texts)
        # print X_tfid_features[1]
        X_literal_features = map(self.get_features, texts)
        # print type(np.array(X_tfid_features))
        X_literal_features = np.concatenate(X_literal_features, axis=0)
        # print [X_tfid_features.shape, X_literal_features.shape]
        return np.concatenate([X_tfid_features_word.toarray(), X_literal_features, X_tfid_features_pos.toarray()],
                              axis=1)

    def predict(self, text):
        return self.clf.predict(self.transform_text_into_feature_vector([text], transform=False))[0]
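
# A minimal usage sketch (the CSV is assumed to have WORD and LABEL columns, as __init__ expects;
# the training-data path is one of those used in the __main__ block below):
#
#   tagger = ExperienceTagger(filepath='data_corpus/experience_training_data.csv')  # train and pickle
#   tagger = ExperienceTagger(modelpath='experiencetaggermodel.bin')                # or reload a saved model
#   print tagger.predict(u'Worked as a Data Analyst for 3 years')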

class ExperienceTagger2(object):
    def __init__(self, filepath=None, modelpath=None):
        if filepath:
            df = self.convert_all_dataframe_from_csvs(filepath)
            # df = pd.read_csv(filepath)
            # df = df.ix[:, [0, 1]]
            # df = df.loc[~pd.isnull(df.WORD)]
            # df = df.loc[~pd.isnull(df.LABEL)]
            self.tfid_word = TfidfVectorizer(ngram_range=(1, 2), token_pattern='\\b[a-zA-Z]\. [a-zA-Z]+|\\b[a-zA-Z\.]+\\b')
            self.tfid_pos = TfidfVectorizer(ngram_range=(1, 3))
            X = self.transform_text_into_feature_vector(texts=df.WORD)
            y = df.LABEL
            self.clf = SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, n_iter=5, random_state=42)
            self.clf.fit(X, y)
            with open('experiencetaggermodel.bin', 'wb') as f:
                pickle.dump([self.clf, self.tfid_pos, self.tfid_word], f)
        elif modelpath:
            with open(modelpath, 'rb') as f:
                self.clf, self.tfid_pos, self.tfid_word = pickle.load(f)

    def get_features(self, element, return_dictionary=False):
        threshold_short = 3
        threshold_long = 5
        # print element
        element = element.decode('utf-8')
        feature_labels = ['size.short', 'size.medium', 'size.long', 'number.of.digits',
                          'size.words', 'no.of.NN', 'no.of.NNP', 'no.of.NNS']
        features = {label: 0 for label in feature_labels}
        # if element.find('%') > -1:
        #     features.update({'contains.percentage.symbol': 1})
        splitted_words = re.split('[^\w\.%]+', element)
        if len(splitted_words) < threshold_short:
            features.update({'size.short': 1})
        elif len(splitted_words) > threshold_long:
            features.update({'size.long': 1})
        else:
            features.update({'size.medium': 1})
        # if re.search('\d+\.\d+', element):
        #     features.update({'number.is.decimal': 1})
        features.update({'number.of.digits': len(re.findall('\d', element))})
        # features.update({'contains.hyphens': element.count('-')})
        # if re.search('\\|\/', element):
        #     features.update({'contains.slash': 1})
        nn = [y for _, y in pos_tag(word_tokenize(element))]
        features.update({'no.of.NN': nn.count('NN')})
        features.update({'no.of.NNP': nn.count('NNP')})
        features.update({'no.of.NNS': nn.count('NNS')})
        # features.update({'size.words': len(re.split('\W+', element))})
        if return_dictionary:
            return features
        return np.array([[x[1] for x in sorted(features.items(), key=lambda x: x[0])]])
    def transform_text_into_feature_vector(self, texts, transform=True):
        pos_tags = map(lambda x: ' '.join(map(lambda x: x[1], x)), pos_tag_sents(map(word_tokenize, texts)))
        if transform:
            X_tfid_features_pos = self.tfid_pos.fit_transform(pos_tags)
            X_tfid_features_word = self.tfid_word.fit_transform(texts)
        else:
            X_tfid_features_pos = self.tfid_pos.transform(pos_tags)
            X_tfid_features_word = self.tfid_word.transform(texts)
        # print X_tfid_features[1]
        X_literal_features = map(self.get_features, texts)
        # print type(np.array(X_tfid_features))
        X_literal_features = np.concatenate(X_literal_features, axis=0)
        # print [X_tfid_features.shape, X_literal_features.shape]
        # print X_tfid_features_word.toarray(), type(X_tfid_features_word.toarray())
        X_tfid_features = hstack((X_tfid_features_word, X_tfid_features_pos))
        sp_literal_features = sparse.csr_matrix(X_literal_features)
        X_features = hstack((X_tfid_features, sp_literal_features))
        return X_features
        # Unreachable leftovers from earlier experiments:
        # return np.concatenate([X_tfid_features_word.toarray(), X_literal_features, X_tfid_features_pos.toarray()],
        #                       axis=1)
        # return np.concatenate([X_tfid_features.toarray(), X_literal_features],
        #                       axis=1)

    def predict(self, text):
        return self.clf.predict(self.transform_text_into_feature_vector([text], transform=False))[0]
    def convert_all_dataframe_from_csvs(self, filepaths):
        df_prev = None
        if isinstance(filepaths, (str, unicode)):
            filepaths = [filepaths]
        for filepath in filepaths:
            df = pd.read_csv(filepath)
            df = df.ix[:, [0, 1]]
            df = df.loc[~pd.isnull(df.WORD)]
            df = df.loc[~pd.isnull(df.LABEL)]
            # print df.shape
            if type(df_prev) == pd.DataFrame:
                df = pd.concat([df, df_prev])
            df_prev = df
        return df

class ExperienceTagger3(object):
    def __init__(self, filepath=None, modelpath=None):
        if filepath:
            df = self.convert_all_dataframe_from_csvs(filepath)
            # df = pd.read_csv(filepath)
            # df = df.ix[:, [0, 1]]
            # df = df.loc[~pd.isnull(df.WORD)]
            # df = df.loc[~pd.isnull(df.LABEL)]
            # self.tfid_word = TfidfVectorizer(ngram_range=(1, 2), token_pattern='\\b[a-zA-Z]\. [a-zA-Z]+|\\b[a-zA-Z\.]+\\b')
            self.tfid_word = CountVectorizer(ngram_range=(1, 2), token_pattern='\\b[a-zA-Z]\. [a-zA-Z]+|\\b[a-zA-Z\.]+\\b')
            # self.tfid_pos = TfidfVectorizer(ngram_range=(1, 2))
            self.tfid_pos = CountVectorizer(ngram_range=(1, 2))
            X = self.transform_text_into_feature_vector(texts=df.WORD)
            y = df.LABEL
            # self.clf = SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, n_iter=5, random_state=41)
            self.clf = RandomForestClassifier(n_estimators=100, random_state=22)
            self.clf.fit(X, y)
            with open('experiencetaggermodel.bin', 'wb') as f:
                pickle.dump([self.clf, self.tfid_pos, self.tfid_word], f)
        elif modelpath:
            with open(modelpath, 'rb') as f:
                self.clf, self.tfid_pos, self.tfid_word = pickle.load(f)

    def get_features(self, element, return_dictionary=False):
        threshold_short = 3
        threshold_long = 5
        # print element
        element = element.decode('utf-8')
        feature_labels = ['size.short', 'size.medium', 'size.long', 'number.of.digits',
                          'size.words', 'no.of.NN', 'no.of.NNP', 'no.of.NNS']
        features = {label: 0 for label in feature_labels}
        # if element.find('%') > -1:
        #     features.update({'contains.percentage.symbol': 1})
        splitted_words = re.split('[^\w\.%]+', element)
        # if len(splitted_words) < threshold_short:
        #     features.update({'size.short': 1})
        # elif len(splitted_words) > threshold_long:
        #     features.update({'size.long': 1})
        # else:
        #     features.update({'size.medium': 1})
        # if re.search('\d+\.\d+', element):
        #     features.update({'number.is.decimal': 1})
        features.update({'number.of.digits': len(re.findall('\d', element))})
        # features.update({'contains.hyphens': element.count('-')})
        # if re.search('\\|\/', element):
        #     features.update({'contains.slash': 1})
        nn = [y for _, y in pos_tag(word_tokenize(element))]
        features.update({'no.of.NN': nn.count('NN')})
        features.update({'no.of.NNP': nn.count('NNP')})
        features.update({'no.of.NNS': nn.count('NNS')})
        # features.update({'size.words': len(re.split('\W+', element))})
        if return_dictionary:
            return features
        return np.array([[x[1] for x in sorted(features.items(), key=lambda x: x[0])]])
    def transform_text_into_feature_vector(self, texts, transform=True):
        pos_tags = map(lambda x: ' '.join(map(lambda x: x[1], x)), pos_tag_sents(map(word_tokenize, texts)))
        if transform:
            X_tfid_features_pos = self.tfid_pos.fit_transform(pos_tags)
            X_tfid_features_word = self.tfid_word.fit_transform(texts)
        else:
            X_tfid_features_pos = self.tfid_pos.transform(pos_tags)
            X_tfid_features_word = self.tfid_word.transform(texts)
        # print X_tfid_features[1]
        X_literal_features = map(self.get_features, texts)
        # print type(np.array(X_tfid_features))
        X_literal_features = np.concatenate(X_literal_features, axis=0)
        # print [X_tfid_features.shape, X_literal_features.shape]
        # print X_tfid_features_word.toarray(), type(X_tfid_features_word.toarray())
        X_tfid_features = hstack((X_tfid_features_word, X_tfid_features_pos))
        sp_literal_features = sparse.csr_matrix(X_literal_features)
        X_features = hstack((X_tfid_features, sp_literal_features))
        return X_features
        # Unreachable leftovers from earlier experiments:
        # return X_tfid_features.toarray()
        # return np.concatenate([X_tfid_features_word.toarray(), X_literal_features, X_tfid_features_pos.toarray()],
        #                       axis=1)
        # return np.concatenate([X_tfid_features.toarray(), X_literal_features],
        #                       axis=1)

    def predict(self, text):
        return self.clf.predict(self.transform_text_into_feature_vector([text], transform=False).toarray())[0]
    def convert_all_dataframe_from_csvs(self, filepaths):
        df_prev = None
        if isinstance(filepaths, (str, unicode)):
            filepaths = [filepaths]
        for filepath in filepaths:
            df = pd.read_csv(filepath)
            df = df.ix[:, [0, 1]]
            df = df.loc[~pd.isnull(df.WORD)]
            df = df.loc[~pd.isnull(df.LABEL)]
            # print df.shape
            if type(df_prev) == pd.DataFrame:
                df = pd.concat([df, df_prev])
            df_prev = df
        return df

if __name__ == '__main__':
    # hc = ExperienceTagger2(modelpath='experiencetaggermodel.bin')
    # hc = ExperienceTagger2(filepath='data_corpus/experience_training_data.csv')  # (modelpath="data_corpus/headingclassifier.bin")
    experience_tagger = ExperienceTagger3(
        filepath=['data_corpus/experience_training_data.csv', 'data_corpus/experience_training_data_others.csv'])
    print experience_tagger.predict(u'Affine Analytics')
# coding: utf-8
import pandas as pd
from itertools import repeat
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer
import numpy as np
from nltk.corpus import stopwords
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import SGDClassifier
import pickle
import pdfminer
from pdfminer.layout import *
filepath = "data_corpus/heading_clustered filtered.csv"

class HeadingClassifier(object):
    def __init__(self, filepath=None, modelpath=None):
        self.filepath = filepath
        self.modelpath = modelpath
        if not modelpath and not filepath:
            raise ValueError("filepath and modelpath both cannot be None!")
        if filepath:
            self._train_model()
        elif modelpath:
            self._load_model()
        else:
            raise ValueError('Pass modelpath or filepath')

    def _train_model(self, save_filepath='data_corpus/headingclassifier.bin'):
        df = pd.read_csv(self.filepath)
        n_df = pd.DataFrame([(label, name.strip(' ')) for name in df for label in df.ix[pd.notnull(df.ix[:, name]), name]],
                            columns=['X', 'y'])
        n_df_shuffled = n_df.iloc[np.random.permutation(len(n_df))]
        heading_clf = Pipeline([
            ('tfidf', TfidfVectorizer(ngram_range=(1, 2), stop_words=stopwords.words())),
            ('clf', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, n_iter=5, random_state=42))])
        # text_clf.fit(n_df_shuffled.X[: 200], n_df_shuffled.y[: 200])
        # text_clf.score(n_df_shuffled.X[200: ], n_df_shuffled.y[200: ])
        heading_clf.fit(n_df_shuffled.X, n_df_shuffled.y)
        heading_clf.score(n_df_shuffled.X, n_df_shuffled.y)
        self.heading_clf = heading_clf
        pickle.dump(obj=self.heading_clf, file=open(save_filepath, 'wb'))

    def _load_model(self):
        self.heading_clf = pickle.load(open(self.modelpath, 'rb'))

    def get_labels(self):
        return list(self.heading_clf.classes_)

    def heading_label(self, heading):
        if isinstance(heading, LTTextLine):
            heading = heading.get_text()
        return self.heading_clf.predict([heading])[0]

    def heading_labels(self, headings):
        return list(self.heading_clf.predict(headings))

if __name__ == '__main__':
    hc = HeadingClassifier(filepath=filepath)  # (modelpath="data_corpus/headingclassifier.bin")
    print hc.heading_label(u'PROFESSIONAL\xa0EXPERIENCE:\xa0\n')
    print hc.heading_label('Scholastic Experience')
    print hc.heading_label('ACADEMIC')
    print list(hc.heading_clf.classes_)

import sys
import os
import time
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage, PDFTextExtractionNotAllowed
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice, TagExtractor
from pdfminer.layout import LAParams
from pdfminer.converter import PDFPageAggregator, XMLConverter, HTMLConverter, TextConverter
from pdfminer.cmapdb import CMapDB
from pdfminer.image import ImageWriter
from cStringIO import StringIO

def convert_pdf_to_text(infile, outfile=None):
    assert infile.endswith('.pdf'), "Hola!! Must be a .pdf file, at least it doesn't end with one ;)"
    debug = 0
    # input options
    password = ''
    pagenos = set()
    maxpages = 1
    # output options
    outtype = None
    imagewriter = None
    rotation = 0
    stripcontrol = False
    layoutmode = 'normal'
    codec = 'utf-8'
    pageno = 1
    scale = 1
    caching = True
    showpageno = True
    laparams = LAParams()
    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFPageInterpreter.debug = debug
    rsrcmgr = PDFResourceManager(caching=caching)
    print outfile
    outfp = StringIO()
    device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                           imagewriter=imagewriter)
    fp = open(infile, 'rb')
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    for page in PDFPage.get_pages(fp, pagenos,
                                  maxpages=maxpages, password=password,
                                  caching=caching, check_extractable=True):
        page.rotate = (page.rotate + rotation) % 360
        interpreter.process_page(page)
    content = outfp.getvalue()
    fp.close()
    device.close()
    outfp.close()
    return content
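
# Usage sketch (the path is hypothetical); note that maxpages is hard-coded to 1 above,
# so only the first page of the PDF is converted:
#
#   first_page_text = convert_pdf_to_text('resumes_data/sample_resume.pdf')
#   print first_page_text[:200]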

def extract_layout_by_page(pdf_path):
    laparams = LAParams()
    fp = open(pdf_path, 'rb')
    parser = PDFParser(fp)
    document = PDFDocument(parser)
    if not document.is_extractable:
        raise PDFTextExtractionNotAllowed
    rsrcmgr = PDFResourceManager()
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    layouts = []
    for page in PDFPage.create_pages(document):
        interpreter.process_page(page)
        layouts.append(device.get_result())
    fp.close()
    return layouts
example_file = "resumes_data/Amit.Nandi_CV.pdf"
example_file = "/home/wolfram/project@work/resume-intent-classification/resumes_data/20160912 TusharSrivastava_CV.pdf"
# example_file = "resumes_data/Adarsh Mohan.pdf"
# example_file = "resumes_data/Manish_Chandra_Gupta.pdf"
# example_file = "docs_converted_to_pdfs/Abhishek Mangla Resume.pdf"
# example_file = "docs_converted_to_pdfs/Abinash Adhikary CV 1.pdf"
# example_file = "docs_converted_to_pdfs/(498771424) GANESH SHANKAR JADHAWARnew.pdf"
# example_file = "Emails/Aakarsh Prasad_resume (final)_2.pdf"
# example_file = "resumes_data/ParulKumar.pdf"
# example_file = "resumes_data/rahul khalkar (purchase engineer).pdf"
page_layouts = extract_layout_by_page(example_file)
text = convert_pdf_to_text(example_file, None)

import itertools
import re
import nltk
from nltk.tokenize import word_tokenize
import pdfminer
from SectionExtractor.SectionExtractor import SectionExtractor

def groupby_tolerance(iterable, key=None, tolerance=0, issorted=False):
    # And again !
    if not issorted:
        iterable = sorted(iterable, key=key, reverse=True)
    if not key:
        key = lambda x: x
    prev = None
    tmp = []
    group = []
    for it in iterable:
        if not prev:
            prev = it
            tmp.append(it)
            continue
        if abs(key(it) - key(prev)) <= tolerance:
            tmp.append(it)
            prev = it
        else:
            group.append(tmp)
            tmp = [it]
            prev = it
    group.append(tmp)
    return group
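
# Illustrative example with made-up y-coordinates: after the descending sort, values that differ
# by at most `tolerance` from the previous value stay in the same group, e.g.
#   groupby_tolerance([720.1, 719.8, 705.0, 704.6], tolerance=2)
#   -> [[720.1, 719.8], [705.0, 704.6]]
# With key=lambda tl: (tl.y0 + tl.y1) / 2 this clusters pdfminer text lines that sit on the same
# visual line of the page, which is how get_grouped_textlines below uses it.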

def is_textline_bold(textline):
    pass


def get_bold_words(textline):
    pass


def concatenate_sents(textlines):
    # stub: strip each line before joining (not used below)
    map(lambda x: x.strip(), textlines)
    pass

def get_grouped_textlines(textlines, print_lines=False):
    def split_text_lines(text):
        return re.split('[\s\:\|\+]{5,}', text)

    def if_starts_with_special_character(text):
        if isinstance(text, pdfminer.layout.LTTextLine):
            text = text.get_text()
        # if re.match(u'\d{2,}', text.strip()):
        #     return None
        # print text, 'SPECIAL CHARACTER', re.match(u'[^a-zA-Z]+$', text.strip())
        return re.match(u'[^a-zA-Z]+', text.strip())

    def if_special_character(text):
        if isinstance(text, pdfminer.layout.LTTextLine):
            text = text.get_text()
        return re.match(u'^[^a-zA-Z]+$', text.strip())

    def if_ends_with_fullstop(text):
        if isinstance(text, pdfminer.layout.LTTextLine):
            text = text.get_text()
        return re.match(u'.*\.[\s]*$', text.strip(' '))

    def sentences_from_subsection(textlines, ifreversed=False):
        if not ifreversed:
            textlines = sorted(textlines, key=lambda x: x.y1, reverse=True)
        return textlines

    # TODO : Analyse the sentence structure - to determine - IF heading ?
    def check_if_heading(lines, n_words, max_len_xcoor, threshold_x_distance=20):
        if len(lines) == 1:
            if len(nltk.word_tokenize(lines[0].get_text())) < n_words and lines[0].x1 < max_len_xcoor:
                pass
            return len(split_text_lines(lines[0].get_text())) > 1
        lines = sorted(lines, key=lambda x: x.x0)
        diff = [lines[idx + 1].x0 - lines[idx].x1 for idx in range(len(lines) - 1)]
        if any(filter(lambda x: x > threshold_x_distance, diff)):
            return True

    def check_if_non_sentence_line(lines, n_letters=5):
        # print lines, len(filter(lambda x: re.search('\w{%s,}' % n_letters, x.get_text()), lines))
        if len(filter(lambda x: len(re.findall('\w', x.get_text())) > n_letters, lines)) >= 2:
            return True
        return False

    # filter(lambda x: re.search('\w+', x.get_text()), textlines)
    textlines = sorted(textlines, key=lambda x: x.y1, reverse=True)
    # print textlines
    same_line_textlines_grouped = map(lambda x: sorted(x, key=lambda y: y.x0),
                                      groupby_tolerance(textlines, key=lambda x: (x.y0 + x.y1) / 2, tolerance=2))
    largest_line = max(textlines, key=lambda x: x.x1 - x.x0)
    xtreme_xcoor = float(largest_line.x1 * 0.75)
    grouped_textlines = []
    temp = []
    for line in same_line_textlines_grouped:
        # if check_if_non_sentence_line(line):
        #     print line, '-------------------------------'
        if check_if_non_sentence_line(line):
            # print line, '------'
            grouped_textlines.extend(map(lambda x: [x], line))
            continue
        if if_special_character(line[0]) or if_starts_with_special_character(line[0]) or check_if_non_sentence_line(line):
            # print 'CONDITION: 1'
            if temp:
                grouped_textlines.append(temp)
                temp = []
        temp.extend(line)
        if if_ends_with_fullstop(line[-1]) or line[-1].x1 <= xtreme_xcoor or check_if_non_sentence_line(line):
            # print 'CONDITION: 2'
            grouped_textlines.append(temp)
            temp = []
    if temp:
        grouped_textlines.append(temp)
    if print_lines:
        for line in grouped_textlines:
            print '----------'
            # print line
            print ' '.join(map(lambda x: x.get_text().strip(), line))
    grouped_lines = [' '.join(map(lambda x: x.get_text().strip(), lines)) for lines in grouped_textlines]
    return grouped_textlines, map(split_text_lines, grouped_lines)

# se = SectionExtractor('resumes_data/20160912 TusharSrivastava_CV.pdf')
se = SectionExtractor('resumes_data/Namrata Meena_IIT Kharagpur.pdf')
# se = SectionExtractor.SectionExtractor('resumes_data/Amit.Nandi_CV.pdf')
# h_clf and experience_tagger are assumed to be the HeadingClassifier and ExperienceTagger3
# instances trained in the snippets above.
textlines = filter(lambda x: h_clf.heading_label(x[0].get_text()) == "EXPERIENCE",
                   se.get_sections_from_resume(as_text=False).items())[0][1]
textlines = filter(lambda x: not re.match('^\s+$', x.get_text()), textlines)
grouped_textlines, grouped_lines = get_grouped_textlines(textlines, print_lines=True)
remove_special_character = lambda x: re.sub(r'[^\x00-\x7F]+', ' ', x)
candidate = map(remove_special_character, filter(lambda x: len(word_tokenize(x)) < 10, sum(grouped_lines, [])))
print filter(lambda x: x[0] != 'OTHERS', zip(map(experience_tagger.predict, candidate), candidate))
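
# End-to-end sketch of how the snippets fit together (names as defined above, paths are examples):
#   1. HeadingClassifier labels the section headings found in the resume layout (e.g. "EXPERIENCE"),
#   2. get_grouped_textlines merges the pdfminer text lines of that section into candidate phrases,
#   3. ExperienceTagger3.predict keeps the phrases whose label is not 'OTHERS'
#      (e.g. organisation names such as u'Affine Analytics').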