This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Demo: fuzzy person-name matching with the `hmni` package.
# Fixed: removed the trailing " | |" scrape artifacts that made the
# snippet syntactically invalid Python.
import hmni

# Initialize a Matcher object with the pre-trained Latin-script model.
matcher = hmni.Matcher(model='latin')

# Single-pair similarity: returns a similarity score
# (the recorded output below suggests a value in [0, 1]).
matcher.similarity('Alan', 'Al')
# 0.6838303319889133

# With prob=False the result is binary — presumably the score
# thresholded to a 0/1 match decision (TODO confirm against hmni docs).
matcher.similarity('Alan', 'Al', prob=False)
# 1
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Fetch a Wikipedia page through the wikipedia-api client and start
# assembling its data into a pandas DataFrame.
# NOTE(review): this snippet is truncated by the gist preview — the
# pd.DataFrame literal is cut off mid-expression — and the trailing
# " | |" tokens on each line are scrape artifacts, not code.
# Original indentation was also lost in scraping.
def wiki_page(page_name): | |
# Plain-text extraction (no wiki markup) for the English Wikipedia.
wiki_api = wikipediaapi.Wikipedia(language='en', | |
extract_format=wikipediaapi.ExtractFormat.WIKI) | |
# Rebinds the parameter to the page object — shadows the input name.
page_name = wiki_api.page(page_name) | |
# Bail out early (returns None) when the page does not exist.
if not page_name.exists(): | |
print('Page {} does not exist.'.format(page_name)) | |
return | |
# DataFrame construction continues beyond the visible lines.
page_data = pd.DataFrame({ | |
'page': page_name, |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import itertools | |
# Preprocess subject/object relation pairs into a directed graph.
# NOTE(review): truncated by the gist preview — the function body
# continues past the visible lines; `hard_lmt` is never used in what
# is shown here. Trailing " | |" tokens are scrape artifacts and the
# original indentation was lost.
def prepro(pairs, filterout=None, hard_lmt=100000): | |
# Mark every pair with a constant relation weight of 1.
pairs['relation'] = 1 | |
# Build a DiGraph with 'subject' as source and 'object' as target.
G = nx.from_pandas_edgelist(pairs, 'subject', 'object', | |
create_using=nx.DiGraph()) | |
# Optionally drop nodes whose subject/object entity type is excluded.
if filterout: | |
# Keep the union of subjects and objects whose types are NOT in
# `filterout`; de-duplicated via set().
nodes = \ | |
list(set(pairs[~pairs.subject_type.isin(filterout)]['subject'].tolist() | |
+ pairs[~pairs.object_type.isin(filterout)]['object'].tolist())) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Restrict the knowledge graph to the part reachable from `node`
# (DFS successors) — presumably for focused plotting; the snippet is
# truncated by the gist preview, so the drawing code that likely
# follows is not visible. Trailing " | |" tokens are scrape artifacts
# and the original indentation was lost.
def filter_graph(pairs, node): | |
# Multigraph: parallel edges allowed between the same two entities.
k_graph = nx.from_pandas_edgelist(pairs, 'subject', 'object', | |
create_using=nx.MultiDiGraph()) | |
# Mapping of each visited node to its DFS successor list.
edges = nx.dfs_successors(k_graph, node) | |
nodes = [] | |
# Collect every node touched by the DFS (keys and their successors).
for k, v in edges.items(): | |
nodes.extend([k]) | |
nodes.extend(v) | |
subgraph = k_graph.subgraph(nodes) | |
# NOTE(review): layout is computed over the FULL graph, not the
# subgraph — possibly intentional to keep positions stable; confirm.
layout = (nx.random_layout(k_graph)) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
import re | |
import spacy | |
import neuralcoref | |
# Load spaCy's large English model and attach the neuralcoref
# coreference-resolution component to its pipeline.
nlp = spacy.load('en_core_web_lg') | |
neuralcoref.add_to_pipe(nlp) | |
# Extract (subject, relation, object) entity pairs from `text`,
# with optional coreference resolution. NOTE(review): only the
# signature is visible — the body is cut off by the gist preview.
# Trailing " | |" tokens are scrape artifacts.
def get_entity_pairs(text, coref=True): |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import networkx as nx | |
import matplotlib.pyplot as plt | |
# Plot the knowledge graph built from subject/object pairs.
# NOTE(review): truncated by the gist preview — the actual nx drawing
# calls presumably follow the visible lines. Trailing " | |" tokens
# are scrape artifacts and the original indentation was lost.
def draw_kg(pairs): | |
k_graph = nx.from_pandas_edgelist(pairs, 'subject', 'object', | |
create_using=nx.MultiDiGraph()) | |
# Degree view — likely used later to scale node sizes (not shown).
node_deg = nx.degree(k_graph) | |
# Force-directed layout; small k spreads nodes only slightly.
layout = nx.spring_layout(k_graph, k=0.15, iterations=20) | |
# Very large canvas (120x90 in at 80 dpi) for a dense graph.
plt.figure(num=None, figsize=(120, 90), dpi=80) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import wikipediaapi # pip install wikipedia-api | |
import pandas as pd | |
import concurrent.futures | |
from tqdm import tqdm | |
# Scrape a Wikipedia topic page and (given the concurrent.futures /
# tqdm imports) presumably fan out over its links in parallel — the
# body is cut off by the gist preview, so this is unconfirmed.
# Trailing " | |" tokens are scrape artifacts; indentation was lost.
def wiki_scrape(topic_name, verbose=True): | |
# Inner worker: fetch one linked page, skipping failures.
# NOTE(review): `wiki_api` is defined outside the visible lines.
def wiki_link(link): | |
try: | |
page = wiki_api.page(link) | |
if page.exists(): |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Download the text of a U.S. Supreme Court opinion and prepare a
# spaCy pipeline with neural coreference resolution.
# Fixed: removed the trailing " | |" scrape artifacts that made the
# snippet syntactically invalid Python.
import urllib.request

from bs4 import BeautifulSoup
import spacy
import neuralcoref

# Large English model + neuralcoref component in the pipeline.
nlp = spacy.load('en_core_web_lg')
neuralcoref.add_to_pipe(nlp)

# Fetch the raw HTML of the opinion page (418 U.S. 683).
html = urllib.request.urlopen('https://www.law.cornell.edu/supremecourt/text/418/683').read()
soup = BeautifulSoup(html, 'html.parser')

# Keep only substantial text nodes that sit directly inside a <p> tag
# (>= 25 chars filters out short navigation/citation fragments) and
# concatenate them into one string.
# NOTE(review): bs4's `text=` argument is deprecated in favor of
# `string=`; kept as-is to avoid changing behavior on old bs4 versions.
text = ''.join([t for t in soup.find_all(text=True) if t.parent.name == 'p' and len(t) >= 25])
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Create an isolated virtual environment and install neuralcoref
# from source in editable mode.
# Fixed: `venv .env` is not a command — the venv module must be run
# via the Python interpreter. Also removed trailing " | |" scrape
# artifacts from every line.
python3 -m venv .env
source .env/bin/activate
git clone https://github.com/huggingface/neuralcoref.git
cd neuralcoref
pip install -r requirements.txt
# Editable install so local changes to the clone take effect directly.
pip install -e .