Skip to content

Instantly share code, notes, and snippets.

@svenski
Created April 6, 2019 10:30
Show Gist options
  • Save svenski/a433a823511a0f9a0941deba93fa0d2f to your computer and use it in GitHub Desktop.
Save svenski/a433a823511a0f9a0941deba93fa0d2f to your computer and use it in GitHub Desktop.
NLP dojo on finding characters in Emma
import spacy
import nltk
from nltk.corpus import gutenberg
# Load spaCy's small English pipeline once, at import time; the code below
# calls it for sentence splitting and for named-entity recognition.
# NOTE(review): an earlier version used the `spacy.load('en')` shortcut --
# presumably replaced because the shortcut is not available in newer spaCy
# releases; confirm against the installed version.
nlp = spacy.load('en_core_web_sm')
import random
import pandas as pd
import matplotlib.pyplot as plt
import re
def _extract_entities(text):
    """Return one row per named entity found in *text*, tagged with its sentence.

    The text is split into sentences with the module-level ``nlp`` pipeline,
    runs of whitespace inside each sentence are collapsed to a single space,
    and every cleaned sentence is parsed again on its own to collect its
    entities.

    Args:
        text: The raw text to analyse.

    Returns:
        A DataFrame with the columns ``Sentence``, ``Entity`` and
        ``Entity_type`` (the spaCy entity label).
    """
    sentences = [re.sub(r"\s+", " ", sent.text) for sent in nlp(text).sents]
    rows = [
        (sentence, ent.text, ent.label_)
        # nlp.pipe parses the sentences in batches and yields the docs in
        # input order, so zip keeps every doc paired with its sentence.
        for sentence, doc in zip(sentences, nlp.pipe(sentences))
        for ent in doc.ents
    ]
    return pd.DataFrame(rows, columns=["Sentence", "Entity", "Entity_type"])


def _top_person_pairs(ents, top_n=10):
    """Count which two PERSON entities share a sentence most often.

    Only sentences mentioning exactly two distinct PERSON entities are
    considered.

    Args:
        ents: DataFrame as produced by ``_extract_entities``.
        top_n: Number of most frequent pairs to return.

    Returns:
        A list of ``((name_a, name_b), count)`` tuples, most frequent first.
    """
    from collections import Counter

    people = ents[ents["Entity_type"] == "PERSON"]
    if people.empty:
        return []

    names_per_sentence = people.groupby("Sentence")["Entity"].agg(set)
    # Sets are unordered: sort each pair so that (A, B) and (B, A) are
    # counted as the same pair instead of two different ones.
    pair_counts = Counter(
        tuple(sorted(names))
        for names in names_per_sentence
        if len(names) == 2
    )
    return pair_counts.most_common(top_n)


def main():
    """Draw a graph of the character pairs that co-occur most often in Emma.

    Loads Jane Austen's "Emma" from the NLTK Gutenberg corpus, extracts the
    PERSON entities of every sentence, and draws a networkx graph whose edges
    connect the ten most frequent two-person pairs; each edge's ``weight`` is
    the number of sentences in which the pair appears together.
    """
    import networkx as nx

    # Replace the '--' sequences in the raw text with spaces before parsing.
    emma = gutenberg.raw('austen-emma.txt').replace('--', ' ')
    ents = _extract_entities(emma)

    graph = nx.Graph()
    for (first, second), count in _top_person_pairs(ents):
        graph.add_edge(first, second, weight=count)
    nx.draw(graph)


if __name__ == "__main__":
    main()
    # nx.draw only renders onto the current matplotlib figure; show it when
    # the file is run as a script.
    plt.show()
@wkostelecki
Copy link

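Line 41 needs to be changed to `mult = people_in_sent[people_in_sent.num_person == 2].copy()`, so that the following `mult['pp'] = ...` assignment writes to an independent copy rather than to a filtered slice of `people_in_sent`.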

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment