Skip to content

Instantly share code, notes, and snippets.

@daemonfire300
Created December 14, 2014 22:19
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save daemonfire300/ae214eb1a0ca29703be1 to your computer and use it in GitHub Desktop.
Save daemonfire300/ae214eb1a0ca29703be1 to your computer and use it in GitHub Desktop.
class MyTagsHandler(xml.sax.handler.ContentHandler):
    """SAX content handler that accumulates a tag co-occurrence table.

    Every element carrying ``Id``, ``OwnerUserId`` and ``Tags`` attributes is
    treated as a post; its ``Tags`` value is expected to look like
    ``"<tag-a><tag-b>"``.

    Attributes:
        tags: maps each tag seen so far to a list.  The list is created empty
            and nothing in this class appends to it.
        edges: ``edges[a][b]`` is the number of times tag ``b`` was listed
            next to tag ``a`` in the same element.  Both directions are
            stored, so the table is symmetric.
        k: counter initialised to 0; not used by this class.
    """

    # Attributes an element must carry before it is treated as a tagged post.
    _REQUIRED_ATTRS = ("Id", "OwnerUserId", "Tags")

    def __init__(self):
        xml.sax.handler.ContentHandler.__init__(self)
        self.tags = dict()
        self.edges = dict()
        self.k = 0

    def assemble_tag_list(self, s):
        """Split a raw ``"<a><b>"`` tag string into a list of tag names.

        ``<`` is dropped and ``>`` becomes a single space before splitting on
        that space, so the result can contain empty strings (for example
        ``""`` yields ``[""]``).  Callers are expected to skip blanks.
        """
        return s.replace("<", "").replace(">", " ").strip().split(" ")

    def update_tag(self, post):
        """Record the tags of one element and their pairwise co-occurrences.

        Args:
            post: attributes object exposing ``getValue("Tags")``.
        """
        # Clean and filter once up front; the nested loop below would
        # otherwise redo this for every outer tag.
        stripped = (raw.strip() for raw in self.assemble_tag_list(post.getValue("Tags")))
        cleaned = [tag for tag in stripped if tag]

        for tag in cleaned:
            self.tags.setdefault(tag, [])
            neighbours = self.edges.setdefault(tag, {})
            for other in cleaned:
                if other != tag:
                    neighbours[other] = neighbours.get(other, 0) + 1

    def startElement(self, name, attrs):
        """Feed elements that carry all required attributes to ``update_tag``."""
        present = attrs.getNames()
        if all(required in present for required in self._REQUIRED_ATTRS):
            self.update_tag(attrs)
def create_tag_graph(el, top_n=3, weighted=False):
    """Build an undirected tag graph and print the highest-PageRank tags.

    Each unordered tag pair becomes exactly one edge whose ``weight`` is the
    sum of the counts found for that pair in either direction, so the edge
    sequence and the weight list always line up one-to-one.

    Args:
        el: mapping ``tag -> {other_tag: count}``.
        top_n: how many of the best-ranked tags to print (default 3).  Fewer
            are printed when the graph has fewer vertices.
        weighted: when true, PageRank uses the ``weight`` edge attribute;
            the default keeps the unweighted ranking.

    Returns:
        The constructed graph.
    """
    g = ig.Graph(directed=False)
    print(g)

    # Dicts double as insertion-ordered sets so vertex and edge order do not
    # depend on string hashing.
    vertex_names = {}
    pair_weights = {}
    for t, neighbours in el.items():
        vertex_names[t] = None
        for k, v in neighbours.items():
            vertex_names[k] = None
            if t == k:
                continue
            # Reuse the orientation under which this pair was first stored so
            # (t, k) and (k, t) accumulate into a single undirected edge.
            pair = (k, t) if (k, t) in pair_weights else (t, k)
            pair_weights[pair] = pair_weights.get(pair, 0) + v

    edges = list(pair_weights)
    weights = [pair_weights[pair] for pair in edges]

    g.add_vertices(list(vertex_names))
    print("len(el)", len(edges))
    g.add_edges(edges)
    print("len(el)", len(g.es))
    if edges:
        g.es["weight"] = weights
    print("len(wl.v)", len(weights))
    # Sample edge for eyeballing; only present in graphs with more than 10 edges.
    if len(g.es) > 10:
        print(g.es[10].source, g.es[10].target, g.es[10].index, g.es[10]["weight"])
    print(len(vertex_names))
    print(len(g.vs))

    use_weights = "weight" if (weighted and edges) else None
    pr = g.pagerank(directed=False, weights=use_weights)

    # Stable descending sort: ties keep the lower vertex index first.
    ranked = sorted(range(len(pr)), key=pr.__getitem__, reverse=True)
    for pr_idx in ranked[:max(top_n, 0)]:
        print("PageRank ", pr[pr_idx], " of tag ", g.vs[pr_idx]["name"])
    return g
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment