Last active
November 28, 2021 18:29
-
-
Save oliver-batey/0a191806d828136ec242256e644fb1d0 to your computer and use it in GitHub Desktop.
Get relevant SVO patterns and sentences
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np
import pandas as pd
from textacy import extract, make_spacy_doc

# Script overview:
#   1. Read a news article from disk and parse it into a spaCy doc.
#   2. Save every sentence whose lemmatized text contains a keyterm.
#   3. Save every subject-verb-object (SVO) triple whose leading lemmas
#      include a keyterm.

# Load the entire article text.
# The encoding is stated explicitly so decoding does not depend on the
# platform default (assumes the article is UTF-8 -- adjust if it is not).
with open("news_article.txt", "r", encoding="utf-8") as article_file:
    # NOTE(review): newlines are removed without inserting a space, so text on
    # adjacent lines is joined directly -- confirm this is intended.
    data = article_file.read().replace("\n", "")
# Swap non-breaking spaces for ordinary spaces.
article = data.replace("\xa0", " ")

# Create doc object
doc = make_spacy_doc(article, lang="en_core_web_sm")

# Top keyterms from each keyword extraction algorithm (known from previous step)
keyterms = ["stock", "elect Joe Biden", "Trump", "Chicago Fed President Charles Evan"]

# Get document sentences containing keyterms, save to dataframe.
# The sentences and their lemmatized text are computed once up front rather
# than once per keyterm; s.lemma_ is the sentence with all tokens lemmatized.
doc_sentences = [(s, s.lemma_) for s in doc.sents]
sentences = []
for term in keyterms:
    for sentence, sentence_lemma in doc_sentences:
        if term in sentence_lemma:
            # Store the sentence as plain text so the frame holds strings.
            sentences.append({"keyterm": term, "sentence": sentence.text})

# Explicit columns keep the CSV header present even when nothing matched.
sentence_data = pd.DataFrame(sentences, columns=["keyterm", "sentence"])
sentence_data.to_csv("sentence_data.csv")

# Extract SVO patterns relevant to keyterms using Textacy.
# The extractor returns a generator of SVOTriple objects (3 lists each).
# A generator can only be consumed once, so it is materialized into a list;
# otherwise only the first keyterm would ever see any triples.
SVOs = list(extract.subject_verb_object_triples(doc))

# Lemma of the first token of each part of every triple, computed once.
svo_lemmas = [[part[0].lemma_ for part in triple] for triple in SVOs]

# Filter the SVO patterns to find the ones mentioning keyterms.
# NOTE(review): a keyterm matches only when it equals one of these single-token
# lemmas exactly, so multi-word keyterms cannot match here -- confirm whether
# phrase-level matching is wanted.
svos = []
for term in keyterms:
    for svo_lemma in svo_lemmas:
        if term in svo_lemma:
            svos.append({"keyterm": term, "svo": list(svo_lemma)})

svo_data = pd.DataFrame(svos, columns=["keyterm", "svo"])
svo_data.to_csv("svo_data.csv")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment