Last active
March 3, 2020 09:50
-
-
Save PandaWhoCodes/a871f13ae956c192080ead7159b146f7 to your computer and use it in GitHub Desktop.
To install the spaCy model en_core_web_md, run: python -m spacy download en_core_web_md
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import spacy
import pandas as pd
import sys
import re

# Load the medium English spaCy model once at import time; it is reused by
# get_entities(). Install it with: python -m spacy download en_core_web_md
nlp = spacy.load("en_core_web_md")
class Error(Exception):
    """Root of this script's exception hierarchy."""
class NoTweetColumnError(Error):
    """Signals that the input CSV has no tweet/text column."""
def get_entities(text):
    """
    Run the spaCy pipeline over *text* and collect its named entities.

    :param text: String to analyse.
    :return: List of lists [[entity text, entity label]]
    """
    doc = nlp(text)
    return [[span.text, span.label_] for span in doc.ents]
def get_csv(csv_name):
    """
    Load a CSV file into a pandas DataFrame, decoding it as UTF-8.

    :param csv_name: csv file name (or any source pandas.read_csv accepts)
    :return: pandas dataframe
    """
    frame = pd.read_csv(csv_name, encoding="utf-8")
    return frame
def get_text(tweets):
    """
    Return the column of tweet text from a given dataframe.

    The common column spellings are checked in order: 'tweet', 'text',
    'Tweet', 'Text'; the first one present wins.

    :param tweets: dataframe of tweet dataset
    :return: pandas Series of tweet text, or None when no such column exists
    """
    # Bug fix: the original returned tweets["tweet"] even when the match was
    # 'text' or 'Tweet', which raised KeyError for those inputs. It also
    # raised and immediately caught its own NoTweetColumnError; a plain
    # loop with a fallback expresses the same contract directly.
    for column in ("tweet", "text", "Tweet", "Text"):
        if column in tweets.columns:
            return tweets[column]
    print("No tweet or text column in given csv input")
    return None
def text_cleanup(text):
    '''
    Text pre-processing for a single tweet.

    Drops the literal "RT" marker, replaces every non-alphabetic character
    and URL-like token with a space, and collapses the remainder into a
    single space-separated string of alphabetic words.
    '''
    # NOTE(review): this removes "RT" anywhere it appears, including inside
    # words (e.g. "START" -> "STA") — presumably intended only for the
    # retweet prefix; confirm before relying on it.
    without_rt = text.replace("RT", "")
    words = re.sub(r'[^a-zA-Z]|(\w+:\/\/\S+)', ' ', without_rt).split()
    return " ".join(words)
if __name__ == '__main__':
    # Usage: python extract_entity.py filename.csv to_file.csv
    # Bug fixes: the original indexed sys.argv[1]/sys.argv[2] without
    # checking argc (IndexError before the usage message could print), and
    # called len() on the None that get_text returns for a bad CSV.
    if len(sys.argv) < 3 or sys.argv[1].split(".")[-1] != "csv":
        print("Input a csv filename")
        print("python extract_entity.py filename.csv to_file.csv")
        sys.exit(1)
    filename = sys.argv[1]
    to_file = sys.argv[2]
    tweets = get_csv(filename)
    text_list = get_text(tweets)
    entities = []
    if text_list is not None and len(text_list):
        for text in text_list:
            entities.extend(get_entities(text_cleanup(text)))
    entity_df = pd.DataFrame(entities, columns=["word", "entity"])
    entity_df.to_csv(to_file, index=False)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment