Skip to content

Instantly share code, notes, and snippets.

@suhdonghwi
Created April 6, 2020 11:42
Show Gist options
  • Save suhdonghwi/36a1a5787058119f0849c22a6fd3124a to your computer and use it in GitHub Desktop.
Save suhdonghwi/36a1a5787058119f0849c22a6fd3124a to your computer and use it in GitHub Desktop.
Analyzing paradigm relationships between programming languages by parsing Wikipedia documents
import requests
from bs4 import BeautifulSoup
import re
import graph_tool.all as gt
def get_html_content(url, timeout=30):
    """Fetch *url* with an HTTP GET and return the response body as text.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a single stalled connection would block the crawl
            forever.

    Returns:
        The decoded body of the response.

    Raises:
        requests.exceptions.RequestException: On connection failure or
            timeout.
    """
    response = requests.get(url, timeout=timeout)
    return response.text
# Patterns used to locate the two infobox labels in a parsed page.
# The trailing `$` on the second pattern stops it from also matching the
# "Influenced by" label.
inf_by_regex, inf_to_regex = map(re.compile, ('Influenced by', 'Influenced$'))
def is_valid(href):
    """Return True when *href* is a `/wiki/` link outside the excluded namespaces.

    Links that do not start with `/wiki/`, and links into the `Category:`
    or `Wikipedia:` namespaces, are rejected.
    """
    if not href.startswith('/wiki/'):
        return False
    excluded_prefixes = ('/wiki/Category:', '/wiki/Wikipedia:')
    return not href.startswith(excluded_prefixes)
def get_links(soup, regex):
    """Collect the article slugs linked from the row that follows a label.

    The first text node in *soup* matching *regex* is located, then the
    next `<tr>` after it is taken and the anchors inside that row's first
    `<td>` are read.

    Args:
        soup: Parsed page (a BeautifulSoup object).
        regex: Compiled pattern identifying the label text.

    Returns:
        A list of hrefs with the leading `/wiki/` removed, keeping only
        those accepted by `is_valid`. The list is empty when the label,
        the following row, or its cell cannot be found.
    """
    subtitle = soup.find(text=regex)
    if subtitle is None:
        return []

    # The label may be the last thing in its table, or the next row may
    # hold no <td>; treat both as "no links" instead of crashing.
    row = subtitle.find_next('tr')
    cell = row.td if row is not None else None
    if cell is None:
        return []

    prefix_len = len('/wiki/')
    return [
        href[prefix_len:]
        for tag in cell.findChildren('a')
        # Anchors without an href attribute are skipped rather than
        # raising KeyError.
        if (href := tag.get('href')) is not None and is_valid(href)
    ]
def simplify_name(name):
    """Shorten a page title so it can be used as a vertex label.

    Parenthesised parts such as "(programming language)" are removed and
    the result is stripped. If what remains is longer than 20 characters
    it is replaced by the initials of its words.

    Args:
        name: Title to shorten.

    Returns:
        The shortened title.
    """
    removed = re.sub(r'\([^)]*\)', '', name).strip()
    if len(removed) > 20:
        # split() with no argument collapses runs of whitespace. Removing
        # a parenthetical from the middle of a title leaves a double
        # space, which made split(' ') yield '' and x[0] raise IndexError.
        return ''.join(word[0] for word in removed.split())
    return removed
def get_title(href, timeout=30):
    """Resolve an article slug to its page title, following redirects.

    Queries the MediaWiki API on en.wikipedia.org with `action=query` and
    the `redirects` flag, then reads the title of the returned page.

    Args:
        href: Article slug as it appears after `/wiki/` in a link; it may
            be percent-encoded.
        timeout: Seconds to wait for the API before giving up.

    Returns:
        The title reported by the API for the page.

    Raises:
        requests.exceptions.RequestException: On connection failure or
            timeout.
        KeyError: If the response holds no page entry.
    """
    from urllib.parse import unquote

    response = requests.get(
        'https://en.wikipedia.org/w/api.php',
        params={
            'action': 'query',
            # The slug comes straight from an href and may already be
            # percent-encoded; decode it so requests does not encode the
            # '%' signs a second time.
            'titles': unquote(href),
            'redirects': '',
            'format': 'json',
        },
        timeout=timeout,
    )
    # Decode the JSON properly instead of splitting the raw text on '"':
    # that left \uXXXX escapes undecoded and broke on titles containing
    # quotes.
    pages = response.json()['query']['pages']
    return next(iter(pages.values()))['title']
# Module-level graph filled in by visit(). Each vertex carries a string
# property, reachable both as `lang_prop` and as `graph.vp.lang`, holding
# the simplified page title.
graph = gt.Graph()
lang_prop = graph.vp.lang = graph.new_vertex_property('string')
def _add_edge_once(source, target):
    """Add the edge source -> target to the graph unless it already exists."""
    if graph.edge(source, target) is None:
        graph.add_edge(source, target)


def visit(href):
    """Add the page at *href* to the graph and recurse into its neighbours.

    The page title is resolved and simplified. If a vertex with that name
    already exists it is returned unchanged. Otherwise a new vertex is
    created, the page is downloaded, and every link found under the
    "Influenced" and "Influenced by" labels is visited recursively and
    connected to the new vertex.

    Args:
        href: Article slug as it appears after `/wiki/` in a link.

    Returns:
        The vertex that represents the page.
    """
    raw_name = get_title(href)
    name = simplify_name(raw_name)

    # Read and write through graph.vp.lang only, so the lookup and the
    # assignment always use the property map attached to the current graph.
    names = graph.vp.lang
    for existing in graph.vertices():
        if names[existing] == name:
            return existing

    v = graph.add_vertex()
    names[v] = name
    print("{} ({})".format(name, raw_name))

    content = get_html_content('https://en.wikipedia.org/wiki/' + href)
    soup = BeautifulSoup(content, 'html.parser')

    # A relation listed on both pages (A says "Influenced: B" and B says
    # "Influenced by: A") is reached from both sides. Adding it only once
    # keeps vertex degrees from being inflated by parallel edges.
    for link in get_links(soup, inf_to_regex):
        dest_v = visit(link)
        _add_edge_once(v, dest_v)
    for link in get_links(soup, inf_by_regex):
        source_v = visit(link)
        _add_edge_once(source_v, v)
    return v
if __name__ == "__main__":
    # To rebuild the graph from Wikipedia, enable the two lines below
    # instead of loading the saved file:
    # visit('C_(programming_language)')
    # graph.save("graph.xml.gz")
    graph = gt.load_graph("graph.xml.gz")

    # Use the vertex labelled "C" as the layout root; fall back to vertex 0
    # when no such vertex exists.
    root = next(
        (v for v in graph.vertices() if graph.vp.lang[v] == "C"),
        0,
    )
    graph.save("graph.xml.gz")

    pos = gt.radial_tree_layout(graph, root=root, weighted=True)
    # Alternative layouts that were tried:
    # pos = gt.arf_layout(graph)
    # pos = gt.sfdp_layout(graph, mu=10.0, mu_p=0.5)
    # pos = gt.fruchterman_reingold_layout(graph, r=10)

    deg = graph.degree_property_map("out")

    # Colour every vertex by its Jaccard similarity to the root. The
    # original indexed vertex 0 directly, which only matches the root when
    # "C" happens to be the first vertex in the file.
    sim = gt.vertex_similarity(graph, "jaccard")
    color = graph.new_vp("double")
    color.a = sim[root].a

    # The unused minimize_nested_blockmodel_dl() call was removed: its
    # result was never read, so it only added run time.

    gt.graph_draw(graph,
                  pos=pos,
                  vorder=deg,
                  vertex_text=graph.vp.lang,
                  vertex_text_position=-10,
                  vertex_text_color="black",
                  # Label and marker sizes both scale with out-degree.
                  vertex_font_size=gt.prop_to_size(deg, mi=1, ma=7),
                  # vertex_font_size=2,
                  vertex_size=gt.prop_to_size(deg, mi=3, ma=25),
                  # vertex_size=5,
                  edge_marker_size=1,
                  edge_pen_width=0.1,
                  vertex_fill_color=color,
                  output="graph-draw-arf.pdf")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment