Skip to content

Instantly share code, notes, and snippets.

@ajosanchez
Last active November 5, 2017 20:59
Show Gist options
  • Save ajosanchez/49d18e54b22f3f408d3557145564f81a to your computer and use it in GitHub Desktop.
Extract all links that refer to people.
from bs4 import BeautifulSoup as bs
import spacy
def hash_text(text, digits=8):
    """Reduce the builtin hash of *text* to a fixed-width non-negative integer.

    The result is ``hash(text) % 10**digits``, i.e. at most *digits* decimal
    digits, suitable for use as a compact node/edge identifier.
    """
    modulus = 10 ** digits
    return hash(text) % modulus
def make_edge_dict(unique_edges):
    """Build a dict mapping hashed link targets to their link text.

    Parameters
    ----------
    unique_edges : iterable of bs4 anchor tags (each with ``.attrs`` and ``.text``)

    Returns
    -------
    dict mapping ``hash_text(href)`` -> link text.

    Edges without an ``href`` attribute (e.g. self-links) are skipped with a
    message rather than aborting the whole batch.
    """
    edge_dict = {}
    for edge in unique_edges:
        try:
            edge_dict[hash_text(edge.attrs['href'])] = edge.text
        # Was a bare `except:` — narrowed so only the expected "no href"
        # failures are swallowed; KeyboardInterrupt/SystemExit now propagate.
        except (KeyError, AttributeError):
            print("edge error, possible selflink")
    return edge_dict
def extract_edges(record):
    """Extract person-link edges from one article record.

    Parses ``record['html']``, runs the article body through the module-level
    spaCy pipeline ``nlp``, and keeps only anchor tags whose text matches a
    PERSON entity found in the body.

    Returns a tuple ``({id: name}, edge_dict)`` where ``edge_dict`` maps
    hashed hrefs to link text (see ``make_edge_dict``).
    """
    soup = bs(record['html'], 'lxml')
    paragraphs = soup.select('#mw-content-text p')
    body_text = " ".join(p.text for p in paragraphs)
    anchors = soup.select('#mw-content-text p a')

    # Set of PERSON entity surface strings for O(1) membership tests.
    person_names = {ent.text for ent in nlp(body_text).ents if ent.label_ == 'PERSON'}
    person_links = [a for a in anchors if a.text in person_names]

    deduped = list(set(person_links))
    return ({record['id']: record['name']}, make_edge_dict(deduped))
# Load the medium English spaCy model once; extract_edges reads it as a
# module-level global.
nlp = spacy.load('en_core_web_md')

# One ({id: name}, edge_dict) tuple per article record.
# NOTE(review): `articles` is not defined in this snippet — presumably an
# iterable of dicts with 'id', 'name', and 'html' keys; confirm at call site.
adjacency_list = [extract_edges(article) for article in articles]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment