Skip to content

Instantly share code, notes, and snippets.

/edge_list.py Secret

Created December 28, 2017 17:37
Show Gist options
  • Save anonymous/2a6c841fe04ebc6d55acc259b4ac4f72 to your computer and use it in GitHub Desktop.
Save anonymous/2a6c841fe04ebc6d55acc259b4ac4f72 to your computer and use it in GitHub Desktop.
Network visualisation code
import csv
import wikipedia
import urllib.request
from bs4 import BeautifulSoup as BS
import re
# Collect every article linked from Wikipedia's list of programming
# languages; each link title is a candidate node of the graph.
pageTitle = "List of programming languages"
nodes = list(wikipedia.page(pageTitle).links)
print(nodes)

# Discard titles containing any of these fragments: they point at index,
# timeline or comparison pages rather than at a single language.
removeList = ["List of","Lists of","Timeline","Comparison of","History of","Esoteric programming language"]
nodes = [
    title
    for title in nodes
    if all(fragment not in title for fragment in removeList)
]

# define some functions
# Prefix that turns an article title into an English-Wikipedia URL.
base = "https://en.wikipedia.org/wiki/"
# get HTML
def getSoup(n):
    """Download the Wikipedia article titled *n* and return its infobox.

    Args:
        n: Article title (one of the entries in ``nodes``).

    Returns:
        The first ``<table class="infobox vevent">`` element of the page, or
        ``None`` when the page cannot be downloaded or contains no such
        table.
    """
    # Function-scope imports keep this block self-contained.
    import http.client
    from urllib.parse import quote

    # Titles can contain spaces and non-ASCII characters, which urllib will
    # not send verbatim. Wikipedia URLs use "_" for spaces; everything else
    # is percent-encoded.
    url = base + quote(n.replace(" ", "_"))
    try:
        with urllib.request.urlopen(url) as response:
            html = response.read()
    except (OSError, ValueError, http.client.HTTPException):
        # URLError/HTTPError are OSError subclasses. A page that cannot be
        # fetched is reported to the caller as "no infobox".
        return None

    soup = BS(html, 'html.parser')
    # find() yields None when there is no match, instead of the IndexError
    # that find_all(...)[0] would raise.
    return soup.find("table", class_="infobox vevent")
# get some metadata
def getYear(t):
    """Extract the "first appeared" year from an infobox.

    The text of *t* is searched for the word "appear"; the first four-digit
    number starting with 1, 2 or 3 inside the 30 characters beginning at
    that word is taken as the year.

    Args:
        t: Object exposing ``get_text()`` (the table returned by
            ``getSoup``), or ``None``.

    Returns:
        The year as an ``int``, or the string ``"Could not determine :("``
        when *t* has no text or no year is found.
    """
    unknown = "Could not determine :("
    try:
        text = t.get_text()
    except AttributeError:
        # t is None when no infobox was found for the page.
        return unknown

    start = text.find("appear")
    if start == -1:
        return unknown

    # re.search rather than re.match(r'.*...'): "." does not match a
    # newline, so an anchored match fails whenever the year sits on the
    # line after the "First appeared" label.
    found = re.search(r'[1-3][0-9]{3}', text[start:start + 30])
    if found is None:
        return unknown
    return int(found.group())
# function to find the links for a given language
def getLinks(t):
    """Return the titles of the languages a given language influenced.

    Scans the rows of *t* for one whose text is "Influenced" (surrounding
    whitespace ignored) and collects the ``title`` attribute of every link
    in the row that follows it.

    Args:
        t: Object exposing ``find_all("tr")`` (the table returned by
            ``getSoup``), or ``None``.

    Returns:
        A list of link titles (possibly empty), or ``None`` when *t* is
        ``None`` or has no "Influenced" row followed by another row.
    """
    try:
        table_rows = t.find_all("tr")
    except AttributeError:
        # t is None when no infobox was found for the page.
        return None

    # Pair every row with its successor: the heading row only labels the
    # section, the links live in the row after it.
    for heading, content in zip(table_rows, table_rows[1:]):
        # strip() makes the comparison independent of how much whitespace
        # the markup puts around the label; "Influenced by" still differs.
        if heading.get_text().strip() != "Influenced":
            continue
        out = []
        for anchor in content.find_all("a"):
            try:
                out.append(anchor['title'])
            except KeyError:
                # Links without a title attribute (e.g. footnote markers)
                # carry no article name.
                continue
        return out
    return None
# create list objects to store data; the first row of each is the CSV header.
# Every row is a list of separate fields so that csv.writer inserts the
# delimiters itself (a pre-joined "a,b" string would be quoted as ONE field).
edgeList = [["Source", "Target"]]
meta = [["Id", "Year"]]

# Set copy of the node titles: constant-time membership tests in the loop.
known_nodes = set(nodes)

# go through each node, use functions defined earlier to collect data and append to lists
for n in nodes:
    try:
        temp = getSoup(n)
        influenced = getLinks(temp)
    except Exception:
        # Best-effort scrape: one unreadable page must not abort the run.
        continue
    if influenced is None:
        # No infobox or no "Influenced" row: the node contributes neither
        # edges nor a metadata row.
        continue
    for link in influenced:
        # Keep only edges whose target is itself a known language.
        if link in known_nodes:
            edgeList.append([n, link])
            print([n+","+link])
    year = getYear(temp)
    meta.append([n, year])

# finally - write CSV files to import into Gephi
# newline="" is what the csv module requires of its file objects (otherwise
# blank lines appear between rows on Windows); utf-8 keeps non-ASCII titles
# writable regardless of the platform's default encoding.
with open("edge_list.csv", "w", newline="", encoding="utf-8") as f:
    wr = csv.writer(f)
    wr.writerows(edgeList)
with open("metadata.csv", "w", newline="", encoding="utf-8") as f2:
    wr = csv.writer(f2)
    wr.writerows(meta)
@Merlin822
Copy link

"year=re.match(r'.([1-3][0-9]{3})',year).group(1)" "year=re.match(r'.([1-3][0-9]{3})',year)" return "none";
"year=re.match(r'(/d{4})',year)" returns "none"; this is a result from the method of 'match';
but ".*" maybe not work;

So `year = re.findall(r'(\d{4})', year)` returns `['1987']`,
while `year = re.findall(r'(\d{4})', year).group(1)` raises "'list' object has no attribute 'group'".

In summary, these results puzzle me. I hope you can give me some guidance.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment