Skip to content

Instantly share code, notes, and snippets.

@ejcer
Created November 13, 2015 21:16
Show Gist options
  • Save ejcer/aab9517eb2bee9e8e800 to your computer and use it in GitHub Desktop.
Save ejcer/aab9517eb2bee9e8e800 to your computer and use it in GitHub Desktop.
import xml.etree.ElementTree as ET
from math import sqrt
import json
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
import os
import io
# Run from the project checkout so that the relative 'datafiles/' and
# 'text-team-webPrototype/' paths used further down resolve.
os.chdir('/home/edward/workspace/school/context_slices')
# Combine aliases to a uniform name.
# NOTE: this shadows the builtin ``filter``; the name is kept for existing callers.
def filter(item, tag, aliases):
    """Return the canonical name for *item*, or *item* itself if it has no alias.

    Args:
        item: The entity name to normalise.
        tag: The entity type (XML tag) that *item* was found under.
        aliases: Sequence of alias entries, each laid out as
            ``[type_tag, primary_name, other_name, ...]``.

    Returns:
        The primary name (slot 1) of the first entry whose type tag equals
        *tag* and whose name slots contain *item*; otherwise *item* unchanged.
    """
    for alias in aliases:
        # Slot 0 is the type tag and the remaining slots are names. Checking
        # them separately keeps a tag from matching a name (or vice versa).
        if alias and alias[0] == tag and item in alias[1:]:
            return alias[1]
    return item
# Find co-occurrence of items.
def cooccur(item1, item2):
    """Count the documents in which both *item1* and *item2* appear.

    Walks every ``document`` element under the module-level ``root`` and
    collects the text of its children, skipping ``docID`` and ``docText``
    children and children without text.

    Args:
        item1: First entity name to look for.
        item2: Second entity name to look for.

    Returns:
        The number of documents whose collected texts contain both names.
    """
    # NOTE(review): this matches on the raw element text. Callers that pass
    # alias-normalised names will not match documents that mention only a
    # non-primary alias -- confirm whether that is intended.
    cooccurence = 0
    for doc in root.iter('document'):
        childtexts = {
            child.text
            for child in doc
            if child.text is not None and child.tag not in ('docID', 'docText')
        }
        if item1 in childtexts and item2 in childtexts:
            cooccurence += 1
    return cooccurence
#build item-item matrix
# def item_item():
# item_item = []
# for pri_item in items:
# cooccurence = 0
# row = []
# #print pri_item
# for itr_item in items:
# #print itr_item
# cooccurence = cooccur(pri_item, itr_item)
# row.append(cooccurence)
# #print cooccurence
# item_item.append(row)
# return item_item
# Load the corpus; cooccur() reads `root` as a module-level global.
tree = ET.parse('datafiles/crescent.xml')
root = tree.getroot()

# Build the alias table.
#   aliases     -- one entry per <alias> element, laid out as
#                  [type tag, primary name, other name, ...]
#   aliasedType -- the entity types (tags) that have alias entries
aliasedType = []
aliases = []
for name in root.iter('alias'):
    pid = []
    oid = []
    # The first grandchild supplies the entity type and the canonical name.
    alias = [name[0][0].tag, name[0][0].text]
    for n in name:
        if "primary" in n.tag:
            pid.append(n)
        if "other" in n.tag:
            oid.append(n)
            alias.append(n[0].text)
    aliases.append(alias)

    # Check that the primary id and each other id are of the same type.
    # The child tags are compared: the wrapper elements themselves are
    # always distinct objects, so comparing them never detects a mismatch.
    for index, o in enumerate(oid):
        for p in pid:
            if o[0].tag != p[0].tag:
                # Single-argument print() works on both Python 2 and 3.
                print("xml file error: id of different type ===> index:  %d" % index)
                break
            # Extract the types that have aliases.
            elif p[0].tag not in aliasedType:
                aliasedType.append(name[0][0].tag)
#doc_item = doc_item()
# Find items: the sorted list of distinct entity names over all documents.
unique_items = set()
for doc in root.iter('document'):
    for child in doc:
        item = child.text
        # Skip empty elements and the document's id / body text.
        if item is None or child.tag in ('docID', 'docText'):
            continue
        if child.tag in aliasedType:
            item = filter(item, child.tag, aliases)
        # De-duplicate on the normalised name. Checking the raw text instead
        # would add the canonical name again for every alias mention.
        unique_items.add(item)
items = sorted(unique_items)
item_item = []

# Node/link structure that is later serialised to JSON. Each node's "group"
# is simply its index in `items`; links refer to nodes by that same index.
graph = {
    "nodes": [{"group": idx, "name": node_name} for idx, node_name in enumerate(items)],
    "links": [],
}

link_count = 0
for idx1, item1 in enumerate(items):
    for idx2, item2 in enumerate(items):
        cooccurence = cooccur(item1, item2)
        # Pairs seen together more than once are always linked. Pairs seen
        # together exactly once are linked only while fewer than 100 links
        # exist in total. Compare with ==, not `is`: identity of equal ints
        # is an implementation detail.
        if cooccurence > 1 or (cooccurence == 1 and link_count < 100):
            graph["links"].append({"source": idx1, "target": idx2, "value": cooccurence})
            link_count += 1
# for idx1, item1 in enumerate(items):
# cooccurence = 0
# row = []
# print "this happened"
# for idx2, item2 in enumerate(items):
# cooccurence = cooccur(item1, item2)
# row.append(cooccurence)
# item_item.append(row)
# print item_item
#
# df = DataFrame(item_item, columns = items, index=items)
# df.to_csv("item_item_dataframe.csv")
#node_links = {"nodes":[{"name":"Myriel","group":1}], "links":[{"source":1,"target":0,"value":1}]}
# Serialise the graph for the web prototype, keeping non-ASCII characters
# unescaped in the UTF-8 output file.
graph_json = json.dumps(graph, indent=4, separators=(',', ': '), ensure_ascii=False)
if isinstance(graph_json, bytes):
    # On Python 2, dumps() can return a byte string, but the text-mode file
    # only accepts unicode. On Python 3 the result is already text.
    graph_json = graph_json.decode('utf-8')
with io.open('text-team-webPrototype/graph.json', 'w', encoding='utf-8') as f:
    f.write(graph_json)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment