Skip to content

Instantly share code, notes, and snippets.

@abmathewks
Created August 17, 2020 22:57
Show Gist options
  • Save abmathewks/7234e4377882e79cee08ad5d9e967867 to your computer and use it in GitHub Desktop.
Save abmathewks/7234e4377882e79cee08ad5d9e967867 to your computer and use it in GitHub Desktop.
### text_extraction.py
#################################################################################
#################################################################################
### PRELIMINARIES
# Extracts named entities from every *.txt file in a fixed data directory using
# NLTK, collecting one row per file into a pandas DataFrame.
import os
# WARNING: hard-coded, machine-specific working directory. The glob("*.txt")
# below depends on this chdir succeeding; running on any other machine raises
# FileNotFoundError here. Consider making this a CLI argument.
os.chdir("/Users/abrahammathew/Desktop/LZ_demo_poc/test_data")
import re
import glob
import pandas as pd
import numpy as np
import nltk
#################################################################################
#################################################################################
### USER DEFINED FUNCTIONS
def clean_str(string):
    """Tokenization-friendly string cleaning for text datasets.

    Adapted from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py

    Keeps only letters, digits, and ``(),!?'`` (everything else becomes a
    space), pads contractions and punctuation with spaces so they tokenize
    as separate tokens, collapses runs of whitespace, and lowercases.

    Parameters
    ----------
    string : str
        Raw input text.

    Returns
    -------
    str
        Cleaned, lowercased text with single spaces between tokens.
    """
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    # Split common English contractions into separate tokens.
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    # Pad punctuation with spaces. NOTE: the replacement strings must be
    # plain " ( ", " ) ", " ? " — the original used " \( " etc., and since
    # "\(" is not a recognized escape, re.sub emitted a literal backslash
    # into the cleaned text (e.g. "why?" -> "why \?").
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " ( ", string)
    string = re.sub(r"\)", " ) ", string)
    string = re.sub(r"\?", " ? ", string)
    # Collapse any run of whitespace (including the padding above) to one space.
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()
def extract_entity_names(t):
    """Recursively collect named-entity strings from a chunked NLTK tree.

    Parameters
    ----------
    t : nltk.Tree or (word, tag) tuple
        A node from ``nltk.ne_chunk_sents(..., binary=True)`` output. Leaf
        tuples have no ``label`` attribute and contribute nothing.

    Returns
    -------
    list of str
        Each 'NE'-labelled subtree joined into one space-separated string,
        in depth-first order. An 'NE' node is not descended into further.
    """
    # Leaves (plain (word, tag) tuples) have no label() -> nothing to collect.
    if not (hasattr(t, 'label') and t.label):
        return []
    # A binary NE chunk: join its leaf words into a single entity string.
    if t.label() == 'NE':
        return [' '.join(leaf[0] for leaf in t)]
    # Interior node: gather entities from every child subtree.
    return [name for subtree in t for name in extract_entity_names(subtree)]
##########################################################################
##########################################################################
### EXECUTE ENTITY EXTRACTION
### EXECUTE ENTITY EXTRACTION
# One row per *.txt file in the current directory: (filename, list of entities).
# Rows are collected in a list and concatenated once at the end —
# DataFrame.append was deprecated and removed in pandas 2.0, and appending
# inside the loop is quadratic.
frames = []
# filename="00001153_ar.txt"
for filename in glob.glob("*.txt"):
    print("")
    print("")
    print(filename)
    # f.read() instead of str(f.readlines()): the old list-repr injected
    # literal brackets, quotes and "\n" sequences into the text fed to NLTK.
    with open(filename, 'r') as f:
        str_output = f.read()
    #str_output = clean_str(str_output)
    # Sentence -> word tokens -> POS tags -> binary NE chunks.
    sentences = nltk.sent_tokenize(str_output)
    tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
    tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences]
    chunked_sentences = nltk.ne_chunk_sents(tagged_sentences, binary=True)
    entity_names = []
    for tree in chunked_sentences:
        entity_names.extend(extract_entity_names(tree))
    # Separate non-binary pass over the whole document: prints typed labels
    # (PERSON, ORGANIZATION, ...) for eyeballing; not stored in the output.
    for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(str_output))):
        if hasattr(chunk, 'label'):
            print(chunk.label(), ' '.join(c[0] for c in chunk))
    dat_ind = pd.DataFrame([filename])
    # Print all entity names
    print(entity_names)
    # Print unique entity names
    #print(set(entity_names))
    # Wrap the list so the whole list lands in a single cell of the row.
    dat_ind["entity_name"] = [entity_names]
    frames.append(dat_ind)
# Empty directory -> empty frame (pd.concat([]) would raise).
full_df = pd.concat(frames) if frames else pd.DataFrame()
##########################################################################
##########################################################################
### SAVE OUTPUT
# NOTE: head(10) returns a DataFrame whose value is discarded when this file
# runs as a script — it only shows output in an interactive session.
full_df.head(10)
# To persist results, use DataFrame.to_csv (the commented call below had a
# typo: "tocsv" is not a pandas method — it should be "to_csv"):
#full_df.tocsv("lz_text_extraction_output1.csv")
##########################################################################
##########################################################################
##########################################################################
##########################################################################
##########################################################################
##########################################################################
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment