Skip to content

Instantly share code, notes, and snippets.

@christopherkullenberg
Last active January 18, 2016 19:15
Show Gist options
  • Save christopherkullenberg/9d44056a222d5ebbdcae to your computer and use it in GitHub Desktop.
Save christopherkullenberg/9d44056a222d5ebbdcae to your computer and use it in GitHub Desktop.
swepubgexf.py
from os import listdir
from lxml import etree as ET
from gexf import *
from itertools import combinations
#import xml.etree.ElementTree as ET #Use this if you don't have lxml installed
# Create the GEXF document and, inside it, the graph that will hold the
# co-author network. The string arguments are passed straight through to the
# gexf library.
# NOTE(review): the two Gexf() arguments look like creator/description
# metadata -- confirm against the gexf library.
gexf = Gexf("Author-Institution network", "GU")
graph = gexf.addGraph("undirected", "static", "Swepub network")
# Declare two node attributes; further down they are filled from the first
# and second ", "-separated part of each author's affiliation string.
# NOTE(review): the second argument appears to be the attribute's default
# value, so nodes lacking a part would presumably carry the literal text
# "default_value" -- confirm against the gexf library.
attribute_node = graph.addNodeAttribute("University", "default_value", "string")
attribute_nodetwo = graph.addNodeAttribute("Institution", "default_value", "string")
records = 0  # Running count of records seen while parsing; printed at the end
therecords = []  # One {author: institution} dict per record, filled by the parsing loop below
# Walk every file in the export directory and collect, for each record, a
# dict mapping author name -> affiliation string. One dict per record is
# appended to `therecords`; `records` counts how many records were seen.
datadir = "GUYear2015N47761Searched20160110/"
for filename in listdir(datadir):
    # Binary mode lets the XML parser decode the bytes itself, using the
    # encoding declared in the document. In text mode the platform default
    # encoding would be applied first, which corrupts or rejects the input
    # as soon as it contains non-ASCII characters on a non-UTF-8 locale.
    with open(datadir + filename, "rb") as currentFile:
        tree = ET.parse(currentFile)
    root = tree.getroot()

    for child in root[0]:  # every child of the root's first element is one record
        records += 1
        coauthors = {}
        for c in child:  # the fields of this record
            # Tag "100" is the first author and tag "700" the other authors;
            # both use the same subfield codes, so they are handled alike.
            if c.get("tag") not in ("100", "700"):
                continue
            # Reset per field so that an affiliation can never be paired
            # with a name left over from a previous field or record.
            author = None
            for value in c:
                code = value.get("code")
                if code == "a":  # "a" is the author name
                    author = value.text
                elif code == "u" and author is not None:  # "u" is the institution
                    institution = value.text
                    coauthors[author] = institution
        therecords.append(coauthors)  # one dict per record, even if empty
# Drop records whose author -> institution mapping is an exact repeat of one
# already seen (same pairs, in the same order). The first occurrence is kept
# and the original ordering is preserved.
# Caveat: two genuinely different articles written by an identical group of
# authors are indistinguishable here, so only one of them survives.
therecordsdeduplicated = []
seen = set()
for record in therecords:
    fingerprint = tuple(record.items())  # hashable snapshot of the dict
    if fingerprint in seen:
        continue
    seen.add(fingerprint)
    therecordsdeduplicated.append(record)
# For every record with at least two authors: add one node per author
# (with its affiliation attributes) and collect the author pairs that will
# later become edges.
edges = []  # one list of (author, author) pairs per co-authored record
coauthorcounter = 0  # number of records with at least two authors
for t in therecordsdeduplicated:
    if len(t) < 2:
        continue  # single-author (or empty) records produce no edges
    coauthorcounter += 1
    # Every unordered pair of authors in this record. This is computed once
    # per record: doing it inside the per-author loop below would emit each
    # pair len(t) times and inflate edge counts for larger author groups.
    edges.append(list(combinations(t, 2)))
    for key, value in t.items():
        # The affiliation is split on ", "; the first part feeds the
        # "University" attribute and the second the "Institution" attribute.
        # A missing affiliation (None) is treated as having no parts.
        newvalue = [] if value is None else value.split(', ')
        n = graph.addNode(key, key)
        for attribute, position in ((attribute_node, 0), (attribute_nodetwo, 1)):
            if position < len(newvalue):
                print(newvalue[position])
                n.addAttribute(attribute, newvalue[position])
            else:
                print("nothing here...")
# Flatten the per-record pair lists into a single sequence, then register
# each pair as a graph edge, using its position in that sequence as the
# edge id.
authoredges = [pair for pairs in edges for pair in pairs]
for edge_id, pair in enumerate(authoredges):
    graph.addEdge(edge_id, pair[0], pair[1])
# Print a short summary of what was processed.
print("There are " + str(records) + " records")
print("There are " + str(coauthorcounter) + " co-authored articles (more than 1 author)\n")
# Write the graph to disk. The context manager guarantees the file is
# flushed and closed even if serialisation raises part-way through.
with open("coauthors.gexf", "wb") as gexf_file:
    gexf.write(gexf_file)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment