Skip to content

Instantly share code, notes, and snippets.

@Christopher-Thornton
Created December 21, 2019 17:21
Show Gist options
  • Save Christopher-Thornton/48df7dcb7da92aa66ab575c764c8f6ad to your computer and use it in GitHub Desktop.
Save Christopher-Thornton/48df7dcb7da92aa66ab575c764c8f6ad to your computer and use it in GitHub Desktop.
import urllib.request
from bs4 import BeautifulSoup
import spacy
import neuralcoref
# Pipeline: download a court opinion, resolve coreferences so that pronouns
# are replaced by the entities they refer to, then print every sentence whose
# lemmatized form mentions "president".

# Load the large English model and attach the neuralcoref component, which
# provides the `doc._.coref_resolved` extension used below.
nlp = spacy.load('en_core_web_lg')
neuralcoref.add_to_pipe(nlp)

# Use a context manager so the HTTP response is closed even if read() fails.
with urllib.request.urlopen('https://www.law.cornell.edu/supremecourt/text/418/683') as response:
    html = response.read()
soup = BeautifulSoup(html, 'html.parser')

# Keep only text nodes whose direct parent is a <p> tag and that are at least
# 25 characters long (drops short fragments). Join with a space rather than
# the empty string so the end of one paragraph does not fuse with the start
# of the next ("...end.Next..."), which would break sentence segmentation.
text = ' '.join(
    t for t in soup.find_all(text=True)
    if t.parent.name == 'p' and len(t) >= 25
)

# Replace each coreferring mention with its main mention.
doc = nlp(text)
resolved_text = doc._.coref_resolved

# Re-parse the resolved text to obtain sentence boundaries. `Span.text` is
# used instead of the deprecated `Span.string`; after strip() both agree.
sentences = [sent.text.strip() for sent in nlp(resolved_text).sents]

# A sentence counts as a "fact" when 'president' occurs in its lower-cased,
# space-joined lemmas. This is a substring test on the joined string, so
# lemmas that merely contain 'president' also match.
output = [
    sent for sent in sentences
    if 'president' in ' '.join(token.lemma_.lower() for token in nlp(sent))
]

print('Fact count:', len(output))
# Number the facts starting from 1.
for number, fact in enumerate(output, start=1):
    print(str(number) + '.', fact)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment