Skip to content

Instantly share code, notes, and snippets.

@abmathewks
Created August 17, 2020 22:57
Show Gist options
  • Save abmathewks/7234e4377882e79cee08ad5d9e967867 to your computer and use it in GitHub Desktop.
Save abmathewks/7234e4377882e79cee08ad5d9e967867 to your computer and use it in GitHub Desktop.
### text_extraction.py
#################################################################################
#################################################################################
### PRELIMINARIES
# Extracts named entities from every *.txt file in a fixed data directory using
# NLTK, collecting one row per file into a pandas DataFrame.
import os
# WARNING: hard-coded, machine-specific working directory. The glob("*.txt")
# below depends on this chdir succeeding; running on any other machine raises
# FileNotFoundError here. Consider making this a CLI argument.
os.chdir("/Users/abrahammathew/Desktop/LZ_demo_poc/test_data")
import re
import glob
import pandas as pd
import numpy as np
import nltk
#################################################################################
#################################################################################
### USER DEFINED FUNCTIONS
def clean_str(string):
    """Tokenization-friendly string cleaning for text datasets.

    Adapted from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py

    Keeps only letters, digits, and ``(),!?'`` (everything else becomes a
    space), pads contractions and punctuation with spaces so they tokenize
    as separate tokens, collapses runs of whitespace, and lowercases.

    Parameters
    ----------
    string : str
        Raw input text.

    Returns
    -------
    str
        Cleaned, lowercased text with single spaces between tokens.
    """
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    # Split common English contractions into separate tokens.
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    # Pad punctuation with spaces. NOTE: the replacement strings must be
    # plain " ( ", " ) ", " ? " — the original used " \( " etc., and since
    # "\(" is not a recognized escape, re.sub emitted a literal backslash
    # into the cleaned text (e.g. "why?" -> "why \?").
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " ( ", string)
    string = re.sub(r"\)", " ) ", string)
    string = re.sub(r"\?", " ? ", string)
    # Collapse any run of whitespace (including the padding above) to one space.
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()
def extract_entity_names(t):
    """Recursively collect named-entity strings from a chunked NLTK tree.

    Parameters
    ----------
    t : nltk.Tree or (word, tag) tuple
        A node from ``nltk.ne_chunk_sents(..., binary=True)`` output. Leaf
        tuples have no ``label`` attribute and contribute nothing.

    Returns
    -------
    list of str
        Each 'NE'-labelled subtree joined into one space-separated string,
        in depth-first order. An 'NE' node is not descended into further.
    """
    # Leaves (plain (word, tag) tuples) have no label() -> nothing to collect.
    if not (hasattr(t, 'label') and t.label):
        return []
    # A binary NE chunk: join its leaf words into a single entity string.
    if t.label() == 'NE':
        return [' '.join(leaf[0] for leaf in t)]
    # Interior node: gather entities from every child subtree.
    return [name for subtree in t for name in extract_entity_names(subtree)]
##########################################################################
##########################################################################
### EXECUTE ENTITY EXTRACTION
### EXECUTE ENTITY EXTRACTION
# One row per *.txt file in the current directory: (filename, list of entities).
# Rows are collected in a list and concatenated once at the end —
# DataFrame.append was deprecated and removed in pandas 2.0, and appending
# inside the loop is quadratic.
frames = []
# filename="00001153_ar.txt"
for filename in glob.glob("*.txt"):
    print("")
    print("")
    print(filename)
    # f.read() instead of str(f.readlines()): the old list-repr injected
    # literal brackets, quotes and "\n" sequences into the text fed to NLTK.
    with open(filename, 'r') as f:
        str_output = f.read()
    #str_output = clean_str(str_output)
    # Sentence -> word tokens -> POS tags -> binary NE chunks.
    sentences = nltk.sent_tokenize(str_output)
    tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
    tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences]
    chunked_sentences = nltk.ne_chunk_sents(tagged_sentences, binary=True)
    entity_names = []
    for tree in chunked_sentences:
        entity_names.extend(extract_entity_names(tree))
    # Separate non-binary pass over the whole document: prints typed labels
    # (PERSON, ORGANIZATION, ...) for eyeballing; not stored in the output.
    for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(str_output))):
        if hasattr(chunk, 'label'):
            print(chunk.label(), ' '.join(c[0] for c in chunk))
    dat_ind = pd.DataFrame([filename])
    # Print all entity names
    print(entity_names)
    # Print unique entity names
    #print(set(entity_names))
    # Wrap the list so the whole list lands in a single cell of the row.
    dat_ind["entity_name"] = [entity_names]
    frames.append(dat_ind)
# Empty directory -> empty frame (pd.concat([]) would raise).
full_df = pd.concat(frames) if frames else pd.DataFrame()
##########################################################################
##########################################################################
### SAVE OUTPUT
# NOTE: head(10) returns a DataFrame whose value is discarded when this file
# runs as a script — it only shows output in an interactive session.
full_df.head(10)
# To persist results, use DataFrame.to_csv (the commented call below had a
# typo: "tocsv" is not a pandas method — it should be "to_csv"):
#full_df.tocsv("lz_text_extraction_output1.csv")
##########################################################################
##########################################################################
##########################################################################
##########################################################################
##########################################################################
##########################################################################
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment