Skip to content

Instantly share code, notes, and snippets.

@oliver-batey
Last active November 28, 2021 18:29
Show Gist options
  • Save oliver-batey/0a191806d828136ec242256e644fb1d0 to your computer and use it in GitHub Desktop.
Save oliver-batey/0a191806d828136ec242256e644fb1d0 to your computer and use it in GitHub Desktop.
Get relevant SVO patterns and sentences
import numpy as np
import pandas as pd
from textacy import extract, make_spacy_doc
# Extract, for a fixed list of keyterms, (1) the sentences of a news article
# whose lemmatized text mentions the keyterm and (2) the subject-verb-object
# (SVO) triples that mention it. Each result set is written to its own CSV.

# Load the entire article text
with open("news_article.txt", "r") as file:
    data = file.read()
# Turn line breaks and non-breaking spaces into plain spaces. Line breaks are
# replaced by a space rather than removed so that the last word of one line
# is not glued onto the first word of the next.
article = data.replace("\n", " ").replace("\xa0", " ")

# Create doc object
doc = make_spacy_doc(article, lang="en_core_web_sm")

# Top keyterms from each keyword extraction algorithm (known from previous step)
keyterms = ["stock", "elect Joe Biden", "Trump", "Chicago Fed President Charles Evan"]

# Get document sentences containing keyterms, save to dataframe.
# doc.sents is materialized once so the same sentence list is reused for
# every keyterm; s.lemma_ is the sentence with all tokens lemmatized.
doc_sentences = list(doc.sents)
sentences = [
    # Store the plain sentence text rather than the spaCy span object.
    {"keyterm": term, "sentence": s.text}
    for term in keyterms
    for s in doc_sentences
    if term in s.lemma_
]
# Explicit columns keep the CSV header stable even when nothing matched.
sentence_data = pd.DataFrame(sentences, columns=["keyterm", "sentence"])
sentence_data.to_csv("sentence_data.csv")

# Extract SVO patterns relevant to keyterms using Textacy.
# subject_verb_object_triples returns a generator; it is turned into a list
# because it is scanned once per keyterm below, and a generator would be
# exhausted after the first keyterm.
SVOs = list(extract.subject_verb_object_triples(doc))

# Lemmatize every token of each part (subject, verb, object) of each triple,
# once, up front. Joining all tokens (instead of keeping only the first one)
# lets multi-word keyterms match.
svo_lemmas = [
    [" ".join(token.lemma_ for token in part) for part in triple]
    for triple in SVOs
]

# Filter the SVO patterns to find the ones mentioning keyterms.
# Substring matching mirrors the sentence filter above.
svos = [
    {"keyterm": term, "svo": svo_lemma}
    for term in keyterms
    for svo_lemma in svo_lemmas
    if any(term in part_lemma for part_lemma in svo_lemma)
]
svo_data = pd.DataFrame(svos, columns=["keyterm", "svo"])
svo_data.to_csv("svo_data.csv")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment