Please extract the noun phrases (NPs) using this!
# coding: utf-8
import pickle
import re

import numpy as np
import pandas as pd
from nltk.tag import pos_tag, pos_tag_sents
from nltk.tokenize import word_tokenize
from scipy import sparse
from scipy.sparse import hstack
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import SGDClassifier

class ExperienceTagger(object):
    """Tags short text snippets with experience labels: word TF-IDF + POS TF-IDF + hand-crafted features, dense matrix, linear SVM."""

    def __init__(self, filepath=None, modelpath=None):
        if filepath:
            df = pd.read_csv(filepath)
            df = df.iloc[:, [0, 1]]
            df = df.loc[~pd.isnull(df.WORD)]
            df = df.loc[~pd.isnull(df.LABEL)]
            self.tfid_word = TfidfVectorizer(ngram_range=(1, 2),
                                             token_pattern=r'\b[a-zA-Z]\. [a-zA-Z]+|\b[a-zA-Z\.]+\b')
            self.tfid_pos = TfidfVectorizer(ngram_range=(1, 3))
            X = self.transform_text_into_feature_vector(texts=df.WORD)
            y = df.LABEL
            # n_iter is the pre-0.19 scikit-learn parameter (later renamed max_iter)
            self.clf = SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, n_iter=5, random_state=42)
            self.clf.fit(X, y)
            with open('experiencetaggermodel.bin', 'wb') as f:
                pickle.dump([self.clf, self.tfid_pos, self.tfid_word], f)
        elif modelpath:
            with open(modelpath, 'rb') as f:
                self.clf, self.tfid_pos, self.tfid_word = pickle.load(f)

    def get_features(self, element, return_dictionary=False):
        """Hand-crafted features: length bucket, digit count and noun-tag counts."""
        threshold_short = 3
        threshold_long = 5
        if isinstance(element, str):
            # decode byte strings only; decoding a unicode object would implicitly re-encode as ASCII
            element = element.decode('utf-8')
        feature_labels = ['size.short', 'size.medium', 'size.long', 'number.of.digits',
                          'size.words', 'no.of.NN', 'no.of.NNP', 'no.of.NNS']
        features = {label: 0 for label in feature_labels}
        # if element.find('%') > -1:
        #     features.update({'contains.percentage.symbol': 1})
        splitted_words = re.split(r'[^\w\.%]+', element)
        if len(splitted_words) < threshold_short:
            features.update({'size.short': 1})
        elif len(splitted_words) > threshold_long:
            features.update({'size.long': 1})
        else:
            features.update({'size.medium': 1})
        # if re.search(r'\d+\.\d+', element):
        #     features.update({'number.is.decimal': 1})
        features.update({'number.of.digits': len(re.findall(r'\d', element))})
        # features.update({'contains.hyphens': element.count('-')})
        # if re.search(r'\\|\/', element):
        #     features.update({'contains.slash': 1})
        tags = [tag for _, tag in pos_tag(word_tokenize(element))]
        features.update({'no.of.NN': tags.count('NN')})
        features.update({'no.of.NNP': tags.count('NNP')})
        features.update({'no.of.NNS': tags.count('NNS')})
        # features.update({'size.words': len(re.split(r'\W+', element))})
        if return_dictionary:
            return features
        # sort by feature name so the column order is deterministic
        return np.array([[value for _, value in sorted(features.items())]])

    def transform_text_into_feature_vector(self, texts, transform=True):
        # transform=True fits the vectorizers (training); transform=False reuses them (inference)
        pos_tags = map(lambda tagged: ' '.join(tag for _, tag in tagged),
                       pos_tag_sents(map(word_tokenize, texts)))
        if transform:
            X_tfid_features_pos = self.tfid_pos.fit_transform(pos_tags)
            X_tfid_features_word = self.tfid_word.fit_transform(texts)
        else:
            X_tfid_features_pos = self.tfid_pos.transform(pos_tags)
            X_tfid_features_word = self.tfid_word.transform(texts)
        X_literal_features = map(self.get_features, texts)
        X_literal_features = np.concatenate(X_literal_features, axis=0)
        return np.concatenate([X_tfid_features_word.toarray(), X_literal_features, X_tfid_features_pos.toarray()],
                              axis=1)

    def predict(self, text):
        return self.clf.predict(self.transform_text_into_feature_vector([text], transform=False))[0]

class ExperienceTagger2(object):
    """Like ExperienceTagger, but trains from one or more CSVs and keeps the feature matrix sparse."""

    def __init__(self, filepath=None, modelpath=None):
        if filepath:
            df = self.convert_all_dataframe_from_csvs(filepath)
            self.tfid_word = TfidfVectorizer(ngram_range=(1, 2),
                                             token_pattern=r'\b[a-zA-Z]\. [a-zA-Z]+|\b[a-zA-Z\.]+\b')
            self.tfid_pos = TfidfVectorizer(ngram_range=(1, 3))
            X = self.transform_text_into_feature_vector(texts=df.WORD)
            y = df.LABEL
            self.clf = SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, n_iter=5, random_state=42)
            self.clf.fit(X, y)
            with open('experiencetaggermodel.bin', 'wb') as f:
                pickle.dump([self.clf, self.tfid_pos, self.tfid_word], f)
        elif modelpath:
            with open(modelpath, 'rb') as f:
                self.clf, self.tfid_pos, self.tfid_word = pickle.load(f)

    def get_features(self, element, return_dictionary=False):
        """Hand-crafted features: length bucket, digit count and noun-tag counts."""
        threshold_short = 3
        threshold_long = 5
        if isinstance(element, str):
            element = element.decode('utf-8')
        feature_labels = ['size.short', 'size.medium', 'size.long', 'number.of.digits',
                          'size.words', 'no.of.NN', 'no.of.NNP', 'no.of.NNS']
        features = {label: 0 for label in feature_labels}
        # if element.find('%') > -1:
        #     features.update({'contains.percentage.symbol': 1})
        splitted_words = re.split(r'[^\w\.%]+', element)
        if len(splitted_words) < threshold_short:
            features.update({'size.short': 1})
        elif len(splitted_words) > threshold_long:
            features.update({'size.long': 1})
        else:
            features.update({'size.medium': 1})
        # if re.search(r'\d+\.\d+', element):
        #     features.update({'number.is.decimal': 1})
        features.update({'number.of.digits': len(re.findall(r'\d', element))})
        # features.update({'contains.hyphens': element.count('-')})
        # if re.search(r'\\|\/', element):
        #     features.update({'contains.slash': 1})
        tags = [tag for _, tag in pos_tag(word_tokenize(element))]
        features.update({'no.of.NN': tags.count('NN')})
        features.update({'no.of.NNP': tags.count('NNP')})
        features.update({'no.of.NNS': tags.count('NNS')})
        # features.update({'size.words': len(re.split(r'\W+', element))})
        if return_dictionary:
            return features
        return np.array([[value for _, value in sorted(features.items())]])

    def transform_text_into_feature_vector(self, texts, transform=True):
        pos_tags = map(lambda tagged: ' '.join(tag for _, tag in tagged),
                       pos_tag_sents(map(word_tokenize, texts)))
        if transform:
            X_tfid_features_pos = self.tfid_pos.fit_transform(pos_tags)
            X_tfid_features_word = self.tfid_word.fit_transform(texts)
        else:
            X_tfid_features_pos = self.tfid_pos.transform(pos_tags)
            X_tfid_features_word = self.tfid_word.transform(texts)
        X_literal_features = map(self.get_features, texts)
        X_literal_features = np.concatenate(X_literal_features, axis=0)
        # keep the TF-IDF blocks sparse and append the dense hand-crafted block as a sparse matrix
        X_tfid_features = hstack((X_tfid_features_word, X_tfid_features_pos))
        sp_literal_features = sparse.csr_matrix(X_literal_features)
        return hstack((X_tfid_features, sp_literal_features))

    def predict(self, text):
        return self.clf.predict(self.transform_text_into_feature_vector([text], transform=False))[0]

    def convert_all_dataframe_from_csvs(self, filepaths):
        """Read one or more labelled CSVs and concatenate them into a single (WORD, LABEL) frame."""
        df_prev = None
        if isinstance(filepaths, (str, unicode)):
            filepaths = [filepaths]
        for filepath in filepaths:
            df = pd.read_csv(filepath)
            df = df.iloc[:, [0, 1]]
            df = df.loc[~pd.isnull(df.WORD)]
            df = df.loc[~pd.isnull(df.LABEL)]
            if isinstance(df_prev, pd.DataFrame):
                df = pd.concat([df, df_prev])
            df_prev = df
        return df

class ExperienceTagger3(object):
    """Third iteration: sparse count features (word and POS n-grams) with a random forest instead of a linear SVM."""

    def __init__(self, filepath=None, modelpath=None):
        if filepath:
            df = self.convert_all_dataframe_from_csvs(filepath)
            # CountVectorizer (raw counts) replaced TfidfVectorizer in this variant
            self.tfid_word = CountVectorizer(ngram_range=(1, 2),
                                             token_pattern=r'\b[a-zA-Z]\. [a-zA-Z]+|\b[a-zA-Z\.]+\b')
            self.tfid_pos = CountVectorizer(ngram_range=(1, 2))
            X = self.transform_text_into_feature_vector(texts=df.WORD)
            y = df.LABEL
            # self.clf = SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, n_iter=5, random_state=41)
            self.clf = RandomForestClassifier(n_estimators=100, random_state=22)
            self.clf.fit(X, y)
            with open('experiencetaggermodel.bin', 'wb') as f:
                pickle.dump([self.clf, self.tfid_pos, self.tfid_word], f)
        elif modelpath:
            with open(modelpath, 'rb') as f:
                self.clf, self.tfid_pos, self.tfid_word = pickle.load(f)

    def get_features(self, element, return_dictionary=False):
        """Hand-crafted features; the size buckets are disabled in this variant."""
        threshold_short = 3
        threshold_long = 5
        if isinstance(element, str):
            element = element.decode('utf-8')
        feature_labels = ['size.short', 'size.medium', 'size.long', 'number.of.digits',
                          'size.words', 'no.of.NN', 'no.of.NNP', 'no.of.NNS']
        features = {label: 0 for label in feature_labels}
        splitted_words = re.split(r'[^\w\.%]+', element)
        # if len(splitted_words) < threshold_short:
        #     features.update({'size.short': 1})
        # elif len(splitted_words) > threshold_long:
        #     features.update({'size.long': 1})
        # else:
        #     features.update({'size.medium': 1})
        features.update({'number.of.digits': len(re.findall(r'\d', element))})
        tags = [tag for _, tag in pos_tag(word_tokenize(element))]
        features.update({'no.of.NN': tags.count('NN')})
        features.update({'no.of.NNP': tags.count('NNP')})
        features.update({'no.of.NNS': tags.count('NNS')})
        if return_dictionary:
            return features
        return np.array([[value for _, value in sorted(features.items())]])

    def transform_text_into_feature_vector(self, texts, transform=True):
        pos_tags = map(lambda tagged: ' '.join(tag for _, tag in tagged),
                       pos_tag_sents(map(word_tokenize, texts)))
        if transform:
            X_tfid_features_pos = self.tfid_pos.fit_transform(pos_tags)
            X_tfid_features_word = self.tfid_word.fit_transform(texts)
        else:
            X_tfid_features_pos = self.tfid_pos.transform(pos_tags)
            X_tfid_features_word = self.tfid_word.transform(texts)
        X_literal_features = map(self.get_features, texts)
        X_literal_features = np.concatenate(X_literal_features, axis=0)
        X_tfid_features = hstack((X_tfid_features_word, X_tfid_features_pos))
        sp_literal_features = sparse.csr_matrix(X_literal_features)
        return hstack((X_tfid_features, sp_literal_features))

    def predict(self, text):
        # RandomForestClassifier needs a dense array
        return self.clf.predict(self.transform_text_into_feature_vector([text], transform=False).toarray())[0]

    def convert_all_dataframe_from_csvs(self, filepaths):
        """Read one or more labelled CSVs and concatenate them into a single (WORD, LABEL) frame."""
        df_prev = None
        if isinstance(filepaths, (str, unicode)):
            filepaths = [filepaths]
        for filepath in filepaths:
            df = pd.read_csv(filepath)
            df = df.iloc[:, [0, 1]]
            df = df.loc[~pd.isnull(df.WORD)]
            df = df.loc[~pd.isnull(df.LABEL)]
            if isinstance(df_prev, pd.DataFrame):
                df = pd.concat([df, df_prev])
            df_prev = df
        return df

if __name__ == '__main__':
    # experience_tagger = ExperienceTagger2(modelpath='experiencetaggermodel.bin')
    experience_tagger = ExperienceTagger3(
        filepath=['data_corpus/experience_training_data.csv',
                  'data_corpus/experience_training_data_others.csv'])
    print experience_tagger.predict(u'Affine Analytics')
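A minimal usage sketch for inference: the model file name is the one hard-coded in the training code above, and the candidate phrases are made up for illustration.

# Reload the pickled model instead of retraining; the phrases below are illustrative.
tagger = ExperienceTagger3(modelpath='experiencetaggermodel.bin')
for phrase in [u'Software Engineer, Acme Corp', u'June 2015 - March 2017', u'Bangalore']:
    print phrase, '->', tagger.predict(phrase)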
# coding: utf-8
import pickle

import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from pdfminer.layout import LTTextLine
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline

filepath = "data_corpus/heading_clustered filtered.csv"

class HeadingClassifier(object):
    """Classifies resume section headings (EXPERIENCE, EDUCATION, ...) with a TF-IDF + linear-SVM pipeline."""

    def __init__(self, filepath=None, modelpath=None):
        self.filepath = filepath
        self.modelpath = modelpath
        if not modelpath and not filepath:
            raise ValueError("filepath and modelpath both cannot be None!")
        if filepath:
            self._train_model()
        else:
            self._load_model()

    def _train_model(self, save_filepath='data_corpus/headingclassifier.bin'):
        df = pd.read_csv(self.filepath)
        # the CSV has one column per heading class; melt it into (heading text, class label) pairs
        n_df = pd.DataFrame([(label, name.strip(' '))
                             for name in df
                             for label in df.loc[pd.notnull(df[name]), name]],
                            columns=['X', 'y'])
        n_df_shuffled = n_df.iloc[np.random.permutation(len(n_df))]
        heading_clf = Pipeline([
            # note: stopwords.words() with no language returns stop words for every NLTK language
            ('tfidf', TfidfVectorizer(ngram_range=(1, 2), stop_words=stopwords.words())),
            ('clf', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, n_iter=5, random_state=42))])
        # heading_clf.fit(n_df_shuffled.X[:200], n_df_shuffled.y[:200])
        # heading_clf.score(n_df_shuffled.X[200:], n_df_shuffled.y[200:])
        heading_clf.fit(n_df_shuffled.X, n_df_shuffled.y)
        self.heading_clf = heading_clf
        with open(save_filepath, 'wb') as f:
            pickle.dump(obj=self.heading_clf, file=f)

    def _load_model(self):
        with open(self.modelpath, 'rb') as f:
            self.heading_clf = pickle.load(f)

    def get_labels(self):
        return list(self.heading_clf.classes_)

    def heading_label(self, heading):
        if isinstance(heading, LTTextLine):
            heading = heading.get_text()
        return self.heading_clf.predict([heading])[0]

    def heading_labels(self, headings):
        return list(self.heading_clf.predict(headings))

if __name__ == '__main__':
    hc = HeadingClassifier(filepath=filepath)  # or HeadingClassifier(modelpath="data_corpus/headingclassifier.bin")
    print hc.heading_label(u'PROFESSIONAL\xa0EXPERIENCE:\xa0\n')
    print hc.heading_label('Scholastic Experience')
    print hc.heading_label('ACADEMIC')
    print hc.get_labels()
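A short sketch of batch labelling with the same pipeline; the model path is the one `_train_model` writes by default, and the heading strings are made up.

# Batch-label raw heading strings via heading_labels(); headings below are illustrative.
hc = HeadingClassifier(modelpath='data_corpus/headingclassifier.bin')
headings = [u'WORK EXPERIENCE', u'EDUCATION', u'TECHNICAL SKILLS']
for heading, label in zip(headings, hc.heading_labels(headings)):
    print heading, '->', label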
from cStringIO import StringIO

from pdfminer.cmapdb import CMapDB
from pdfminer.converter import PDFPageAggregator, TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage, PDFTextExtractionNotAllowed
from pdfminer.pdfparser import PDFParser

def convert_pdf_to_text(infile, outfile=None):
    """Extract plain text from the first page of a PDF with pdfminer's TextConverter.

    `outfile` is accepted for interface compatibility but unused; the text is returned.
    """
    assert infile.endswith('.pdf'), "Must be a .pdf file, or at least end with .pdf ;)"
    debug = 0
    # input options
    password = ''
    pagenos = set()
    maxpages = 1  # only the first page is processed
    # output options
    imagewriter = None
    rotation = 0
    codec = 'utf-8'
    caching = True
    laparams = LAParams()
    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFPageInterpreter.debug = debug
    rsrcmgr = PDFResourceManager(caching=caching)
    outfp = StringIO()
    device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                           imagewriter=imagewriter)
    fp = open(infile, 'rb')
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    for page in PDFPage.get_pages(fp, pagenos,
                                  maxpages=maxpages, password=password,
                                  caching=caching, check_extractable=True):
        page.rotate = (page.rotate + rotation) % 360
        interpreter.process_page(page)
    content = outfp.getvalue()
    fp.close()
    device.close()
    outfp.close()
    return content

def extract_layout_by_page(pdf_path):
    """Return the pdfminer layout object (LTPage) for every page of the PDF."""
    laparams = LAParams()
    fp = open(pdf_path, 'rb')
    parser = PDFParser(fp)
    document = PDFDocument(parser)
    if not document.is_extractable:
        raise PDFTextExtractionNotAllowed
    rsrcmgr = PDFResourceManager()
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    layouts = []
    for page in PDFPage.create_pages(document):
        interpreter.process_page(page)
        layouts.append(device.get_result())
    fp.close()
    return layouts

example_file = "/home/wolfram/project@work/resume-intent-classification/resumes_data/20160912 TusharSrivastava_CV.pdf"
# example_file = "resumes_data/Amit.Nandi_CV.pdf"
# example_file = "resumes_data/Adarsh Mohan.pdf"
# example_file = "resumes_data/Manish_Chandra_Gupta.pdf"
# example_file = "docs_converted_to_pdfs/Abhishek Mangla Resume.pdf"
# example_file = "docs_converted_to_pdfs/Abinash Adhikary CV 1.pdf"
# example_file = "docs_converted_to_pdfs/(498771424) GANESH SHANKAR JADHAWARnew.pdf"
# example_file = "Emails/Aakarsh Prasad_resume (final)_2.pdf"
# example_file = "resumes_data/ParulKumar.pdf"
# example_file = "resumes_data/rahul khalkar (purchase engineer).pdf"

page_layouts = extract_layout_by_page(example_file)
text = convert_pdf_to_text(example_file, None)
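The LTPage objects in page_layouts are nested containers; here is a short sketch, using the standard pdfminer layout API, of walking them to collect the individual text lines the later snippets work with.

from pdfminer.layout import LTTextBox, LTTextLine

# Each LTPage iterates over its boxes; each LTTextBox iterates over LTTextLine objects.
textlines = []
for page_layout in page_layouts:
    for element in page_layout:
        if isinstance(element, LTTextBox):
            textlines.extend(obj for obj in element if isinstance(obj, LTTextLine))
print '\n'.join(line.get_text().strip() for line in textlines[:10])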
import re

import nltk
import pdfminer
from SectionExtractor.SectionExtractor import SectionExtractor

def groupby_tolerance(iterable, key=None, tolerance=0, issorted=False):
    """Group consecutive items whose key values differ by at most `tolerance`.

    Used below to cluster textlines that sit on (almost) the same baseline.
    """
    if not key:
        key = lambda x: x
    if not issorted:
        iterable = sorted(iterable, key=key, reverse=True)
    prev = None
    tmp = []
    group = []
    for it in iterable:
        if prev is None:
            prev = it
            tmp.append(it)
            continue
        if abs(key(it) - key(prev)) <= tolerance:
            tmp.append(it)
        else:
            group.append(tmp)
            tmp = [it]
        prev = it
    group.append(tmp)
    return group
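# Worked example (hypothetical numbers): grouping y-midpoints with tolerance 2
#     groupby_tolerance([100.0, 99.1, 80.5, 80.0], tolerance=2)
#     -> [[100.0, 99.1], [80.5, 80.0]]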

def is_textline_bold(textline):
    # TODO: inspect the fontname of the LTChar objects inside the line
    pass


def get_bold_words(textline):
    # TODO
    pass


def concatenate_sents(textlines):
    # unfinished helper; presumably meant to strip and join the lines
    return ' '.join(x.strip() for x in textlines)

def get_grouped_textlines(textlines, print_lines=False):
    def split_text_lines(text):
        # a run of 5+ separator characters usually means two visual columns on one line
        return re.split(r'[\s\:\|\+]{5,}', text)

    def if_starts_with_special_character(text):
        if isinstance(text, pdfminer.layout.LTTextLine):
            text = text.get_text()
        return re.match(u'[^a-zA-Z]+', text.strip())

    def if_special_character(text):
        if isinstance(text, pdfminer.layout.LTTextLine):
            text = text.get_text()
        return re.match(u'^[^a-zA-Z]+$', text.strip())

    def if_ends_with_fullstop(text):
        if isinstance(text, pdfminer.layout.LTTextLine):
            text = text.get_text()
        return re.match(u'.*\.[\s]*$', text.strip(' '))

    def sentences_from_subsection(textlines, ifreversed=False):
        # TODO: analyse the sentence structure to decide whether a line is a heading
        if not ifreversed:
            textlines = sorted(textlines, key=lambda x: x.y1, reverse=True)
        return textlines

    def check_if_heading(lines, n_words, max_len_xcoor, threshold_x_distance=20):
        # a single short line ending well before the right margin, or fragments
        # separated by large horizontal gaps, are treated as a heading
        if len(lines) == 1:
            if len(nltk.word_tokenize(lines[0].get_text())) < n_words and lines[0].x1 < max_len_xcoor:
                return True
            return len(split_text_lines(lines[0].get_text())) > 1
        lines = sorted(lines, key=lambda x: x.x0)
        diff = [lines[idx + 1].x0 - lines[idx].x1 for idx in range(len(lines) - 1)]
        return any(d > threshold_x_distance for d in diff)

    def check_if_non_sentence_line(lines, n_letters=5):
        # two or more fragments with more than `n_letters` letters each:
        # probably a table-like row, not a flowing sentence
        if len(filter(lambda x: len(re.findall(r'\w', x.get_text())) > n_letters, lines)) >= 2:
            return True
        return False

    textlines = sorted(textlines, key=lambda x: x.y1, reverse=True)
    # cluster textlines sharing a baseline, then order each cluster left-to-right
    same_line_textlines_grouped = map(lambda x: sorted(x, key=lambda y: y.x0),
                                      groupby_tolerance(textlines, key=lambda x: (x.y0 + x.y1) / 2, tolerance=2))
    largest_line = max(textlines, key=lambda x: x.x1 - x.x0)
    xtreme_xcoor = float(largest_line.x1 * 0.75)
    grouped_textlines = []
    temp = []
    for line in same_line_textlines_grouped:
        if check_if_non_sentence_line(line):
            # table-like rows become one group per fragment
            grouped_textlines.extend(map(lambda x: [x], line))
            continue
        if if_special_character(line[0]) or if_starts_with_special_character(line[0]):
            # a bullet or other non-letter prefix starts a new group
            if temp:
                grouped_textlines.append(temp)
                temp = []
        temp.extend(line)
        if if_ends_with_fullstop(line[-1]) or line[-1].x1 <= xtreme_xcoor:
            # a full stop, or a line ending well short of the right margin, closes the group
            grouped_textlines.append(temp)
            temp = []
    if temp:
        grouped_textlines.append(temp)
    if print_lines:
        for line in grouped_textlines:
            print '----------'
            print ' '.join(map(lambda x: x.get_text().strip(), line))
    grouped_lines = [' '.join(map(lambda x: x.get_text().strip(), lines)) for lines in grouped_textlines]
    return grouped_textlines, map(split_text_lines, grouped_lines)

# se = SectionExtractor('resumes_data/20160912 TusharSrivastava_CV.pdf')
# se = SectionExtractor('resumes_data/Amit.Nandi_CV.pdf')
se = SectionExtractor('resumes_data/Namrata Meena_IIT Kharagpur.pdf')
# h_clf is a trained HeadingClassifier instance (see the heading-classifier snippet above)
textlines = filter(lambda x: h_clf.heading_label(x[0].get_text()) == "EXPERIENCE",
                   se.get_sections_from_resume(as_text=False).items())[0][1]
textlines = filter(lambda x: not re.match(r'^\s+$', x.get_text()), textlines)
print get_grouped_textlines(textlines, print_lines=True)
# Candidates: short grouped lines with non-ASCII stripped, keeping everything the
# experience tagger does not label OTHERS. Assumes `grouped_lines`, `word_tokenize`
# and `experience_tagger` from the snippets above.
remove_special_character = lambda x: re.sub(r'[^\x00-\x7F]+', ' ', x)
candidate = map(remove_special_character,
                filter(lambda x: len(word_tokenize(x)) < 10, sum(grouped_lines, [])))
print filter(lambda x: x[0] != 'OTHERS', zip(map(experience_tagger.predict, candidate), candidate))
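Read end to end, the snippets form one pipeline; here is a hedged recap wiring them together. The class and function names come from the files above, while the model paths and the resume path are illustrative assumptions.

# End-to-end sketch assembled from the snippets above; paths are illustrative.
import re
from nltk.tokenize import word_tokenize

h_clf = HeadingClassifier(modelpath='data_corpus/headingclassifier.bin')
experience_tagger = ExperienceTagger3(modelpath='experiencetaggermodel.bin')

se = SectionExtractor('resumes_data/some_resume.pdf')  # hypothetical input file
sections = se.get_sections_from_resume(as_text=False).items()
experience = filter(lambda s: h_clf.heading_label(s[0].get_text()) == 'EXPERIENCE', sections)[0][1]
experience = filter(lambda x: not re.match(r'^\s+$', x.get_text()), experience)

_, grouped_lines = get_grouped_textlines(experience)
candidates = [re.sub(r'[^\x00-\x7F]+', ' ', c) for c in sum(grouped_lines, [])
              if len(word_tokenize(c)) < 10]
print [(label, c) for label, c in zip(map(experience_tagger.predict, candidates), candidates)
       if label != 'OTHERS']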