Last active
March 3, 2020 09:50
-
-
Save PandaWhoCodes/a871f13ae956c192080ead7159b146f7 to your computer and use it in GitHub Desktop.
To install the spaCy model en_core_web_md, run: python -m spacy download en_core_web_md
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import spacy
import pandas as pd
import sys
import re

# Load the medium English spaCy model once at import time; it is reused by
# get_entities(). Install it with: python -m spacy download en_core_web_md
nlp = spacy.load("en_core_web_md")
class Error(Exception):
    """Root of this script's exception hierarchy."""
class NoTweetColumnError(Error):
    """Signals that the input CSV has no tweet/text column."""
def get_entities(text):
    """
    Run the spaCy pipeline over *text* and collect its named entities.

    :param text: String to analyse.
    :return: List of lists [[entity text, entity label]]
    """
    doc = nlp(text)
    return [[span.text, span.label_] for span in doc.ents]
def get_csv(csv_name):
    """
    Load a CSV file into a pandas DataFrame, decoding it as UTF-8.

    :param csv_name: csv file name (or any source pandas.read_csv accepts)
    :return: pandas dataframe
    """
    frame = pd.read_csv(csv_name, encoding="utf-8")
    return frame
def get_text(tweets):
    """
    Return the column of tweet text from a given dataframe.

    The common column spellings are checked in order: 'tweet', 'text',
    'Tweet', 'Text'; the first one present wins.

    :param tweets: dataframe of tweet dataset
    :return: pandas Series of tweet text, or None when no such column exists
    """
    # Bug fix: the original returned tweets["tweet"] even when the match was
    # 'text' or 'Tweet', which raised KeyError for those inputs. It also
    # raised and immediately caught its own NoTweetColumnError; a plain
    # loop with a fallback expresses the same contract directly.
    for column in ("tweet", "text", "Tweet", "Text"):
        if column in tweets.columns:
            return tweets[column]
    print("No tweet or text column in given csv input")
    return None
def text_cleanup(text):
    '''
    Text pre-processing for a single tweet.

    Drops the literal "RT" marker, replaces every non-alphabetic character
    and URL-like token with a space, and collapses the remainder into a
    single space-separated string of alphabetic words.
    '''
    # NOTE(review): this removes "RT" anywhere it appears, including inside
    # words (e.g. "START" -> "STA") — presumably intended only for the
    # retweet prefix; confirm before relying on it.
    without_rt = text.replace("RT", "")
    words = re.sub(r'[^a-zA-Z]|(\w+:\/\/\S+)', ' ', without_rt).split()
    return " ".join(words)
if __name__ == '__main__':
    # Usage: python extract_entity.py filename.csv to_file.csv
    # Bug fixes: the original indexed sys.argv[1]/sys.argv[2] without
    # checking argc (IndexError before the usage message could print), and
    # called len() on the None that get_text returns for a bad CSV.
    if len(sys.argv) < 3 or sys.argv[1].split(".")[-1] != "csv":
        print("Input a csv filename")
        print("python extract_entity.py filename.csv to_file.csv")
        sys.exit(1)
    filename = sys.argv[1]
    to_file = sys.argv[2]
    tweets = get_csv(filename)
    text_list = get_text(tweets)
    entities = []
    if text_list is not None and len(text_list):
        for text in text_list:
            entities.extend(get_entities(text_cleanup(text)))
    entity_df = pd.DataFrame(entities, columns=["word", "entity"])
    entity_df.to_csv(to_file, index=False)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment