Skip to content

Instantly share code, notes, and snippets.

@PandaWhoCodes
Last active March 3, 2020 09:50
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save PandaWhoCodes/a871f13ae956c192080ead7159b146f7 to your computer and use it in GitHub Desktop.
Save PandaWhoCodes/a871f13ae956c192080ead7159b146f7 to your computer and use it in GitHub Desktop.
To install the spaCy model en_core_web_md, run: python -m spacy download en_core_web_md
import spacy
import pandas as pd
import sys
import re
# Load the spaCy "en_core_web_md" pipeline once at import time; get_entities()
# reuses this shared instance for every text it processes.
nlp = spacy.load("en_core_web_md")
class Error(Exception):
    """Common parent for the exceptions defined in this script"""
class NoTweetColumnError(Error):
    """Signals that the given csv input has no tweet column"""
def get_entities(text):
    """
    Runs the shared spaCy pipeline over a text and collects its named entities
    :param text: String
    :return: List of [entity text, entity label] pairs, in document order
    """
    parsed = nlp(text)
    return [[span.text, span.label_] for span in parsed.ents]
def get_csv(csv_name):
    """
    Loads a CSV file into a pandas dataframe, reading it as UTF-8
    :param csv_name: csv file name
    :return: pandas dataframe
    """
    dataframe = pd.read_csv(csv_name, encoding="utf-8")
    return dataframe
def get_text(tweets):
    """
    Returns the column holding the tweet text from a given dataframe

    The column names "tweet", "text", "Tweet" and "Text" are tried in that
    order and the first one present is returned.
    :param tweets: dataframe of tweet dataset
    :return: the matching dataframe column, or None (after printing a
             message) when none of the accepted column names exists
    """
    # Return the column that was actually found; indexing a fixed name here
    # would raise KeyError whenever the data uses one of the other spellings.
    for column in ("tweet", "text", "Tweet", "Text"):
        if column in tweets.columns:
            return tweets[column]
    print("No tweet or text column in given csv input")
    return None
def text_cleanup(text):
    """
    Text pre-processing: removes retweet markers, URLs and non-letter characters
    :param text: raw tweet text; non-string values (e.g. NaN from an empty
                 csv cell) are treated as empty text
    :return: String of the remaining words separated by single spaces
    """
    if not isinstance(text, str):
        return ""
    # Remove "RT" only as a standalone token so that words which merely
    # contain those letters (e.g. "SMART", "ALERT") are left intact.
    text_clean = re.sub(r'\bRT\b', '', text)
    # Blank out URLs and every non-letter character; split() then discards
    # the resulting runs of whitespace.
    words = re.sub(r'[^a-zA-Z]|(\w+:\/\/\S+)', ' ', text_clean).split()
    return " ".join(words)
if __name__ == '__main__':
    # Usage: python extract_entity.py filename.csv to_file.csv
    # Check the argument count before indexing sys.argv so that a missing
    # argument prints the usage text instead of raising IndexError.
    if len(sys.argv) < 3 or sys.argv[1].split(".")[-1] != "csv":
        print("Input a csv filename")
        print("python extract_entity.py filename.csv to_file.csv")
        sys.exit(1)
    filename, to_file = sys.argv[1], sys.argv[2]
    tweets = get_csv(filename)
    text_list = get_text(tweets)
    if text_list is None:
        # get_text() has already reported the missing column.
        sys.exit(1)
    entities = []
    # Skip empty cells: pandas reads them as NaN, which is not text.
    for text in text_list.dropna():
        entities.extend(get_entities(text_cleanup(text)))
    # One output row per entity occurrence: the entity text and its label.
    entity_df = pd.DataFrame(entities, columns=["word", "entity"])
    entity_df.to_csv(to_file, index=False)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment