Skip to content

Instantly share code, notes, and snippets.

@elendiastarman
Created July 28, 2016 18:01
Show Gist options
  • Save elendiastarman/faf00b9c43c0c937d314056360e08cdf to your computer and use it in GitHub Desktop.
Save elendiastarman/faf00b9c43c0c937d314056360e08cdf to your computer and use it in GitHub Desktop.
import urllib.request as ur
import html.parser as hp
class parser(hp.HTMLParser):
    """Collect chat messages and user names from transcript HTML.

    After ``feed()``, ``names`` maps user id -> display name and ``messages``
    maps message id -> dict with the keys ``uid``, ``name``, ``content`` and,
    for replies, ``rid`` (id of the message being replied to).

    The parser is a small state machine driven by ``self.state``:
        ""           idle
        "need name"  inside a monologue whose user has no recorded name yet
        "get name"   the next text chunk is that user's name
        "content"    inside a message's content element
    """

    # Characters dropped from the front of the first text chunk of a message
    # and from the end of the finished content.  Presumably the indentation
    # whitespace around message text in the transcript markup -- TODO confirm.
    _LEAD_PAD = 22
    _TRAIL_PAD = 40

    def __init__(self, *args, **kwargs):
        """Create an idle parser; pass ``debug=<truthy>`` to trace tags/text."""
        super().__init__()
        self.numTags = 0  # div/a start tags seen outside message content
        self.numText = 0  # text chunks seen inside message content
        self.state = ""
        self.names = {}
        self.messages = {}
        # Defined up front so markup arriving in an unexpected order cannot
        # raise AttributeError on these attributes.
        self.currUser = None
        self.currMess = None
        self.debug = kwargs.get('debug', 0)
        if self.debug:
            print("Debug on.")

    @staticmethod
    def _userId(classValue):
        """Return the integer following ``user-`` in a monologue class string.

        Raises ValueError when no ``user-<int>`` token is present.
        """
        for token in classValue.split():
            if token.startswith("user-"):
                return int(token[len("user-"):])
        raise ValueError("no user id in monologue class %r" % classValue)

    @staticmethod
    def _attr(attrs, name, position):
        """Return the value of attribute *name*.

        Falls back to the attribute at index *position* when *name* is absent
        (IndexError if there is no such attribute).
        """
        for key, value in attrs:
            if key == name:
                return value
        return attrs[position][1]

    def handle_starttag(self, tag, attrs):
        """Advance the state machine on an opening tag."""
        if self.state == "content":
            entry = self.messages[self.currMess]
            if tag != "div":
                # Inline markup is kept as a bare tag, without attributes.
                entry["content"] += "<%s>" % tag
            else:
                # A nested div replaces the collected text with a placeholder
                # built from its first attribute value (or the tag name when
                # it has no attributes) and ends content collection.
                label = attrs[0][1] if attrs else tag
                entry["content"] = ("<%s>" % label).replace(' ', '_')
                self.state = ""
            return

        if tag not in ("div", "a"):
            return
        self.numTags += 1
        if self.debug:
            print("tag:", tag)
            print("attrs:", attrs)

        # Only elements whose first attribute is ``class`` are of interest.
        if not attrs or attrs[0][0] != "class":
            return
        cls = attrs[0][1]
        if cls is None:
            return

        if cls.startswith("monologue"):
            uid = self._userId(cls)
            self.currUser = uid
            if uid not in self.names:
                self.state = "need name"
        elif cls == "message":
            # The id attribute has the form "<prefix>-<message id>".
            mid = int(self._attr(attrs, "id", 1).split('-')[1])
            self.messages[mid] = {'uid': self.currUser,
                                  # None when the name was never captured.
                                  'name': self.names.get(self.currUser),
                                  'content': ""}
            self.currMess = mid
        elif cls == "reply-info":
            # The link target ends in "#<id of the message replied to>".
            if self.currMess is not None:
                rid = int(self._attr(attrs, "href", 1).split('#')[1])
                self.messages[self.currMess]["rid"] = rid
        elif cls == "username" and self.state == "need name":
            self.state = "get name"
        elif cls == "content":
            # Content outside of any message has nowhere to be stored.
            if self.currMess is not None:
                self.state = "content"

    def handle_endtag(self, tag):
        """Close the content element or record an inline closing tag."""
        if self.state != "content":
            return
        entry = self.messages[self.currMess]
        if tag == "div":
            self.state = ""
            entry["content"] = entry["content"][:-self._TRAIL_PAD]
        else:
            entry["content"] += "</%s>" % tag

    def handle_data(self, data):
        """Append message text, or capture a pending user name."""
        if self.state == "content":
            if self.debug:
                print(" data:", data)
            self.numText += 1
            entry = self.messages[self.currMess]
            if entry["content"]:
                entry["content"] += data
            else:
                # First chunk: drop the leading padding.
                entry["content"] = data[self._LEAD_PAD:]
        elif self.state == "get name":
            self.state = ""
            self.names[self.currUser] = data.strip()
def parseConvos(roomNum=240, year=2016, month=3, day=23, hourStart=0, hourEnd=4, debug=0):
    """Download one transcript page and group its messages by reply links.

    The page fetched is
    ``http://chat.stackexchange.com/transcript/<roomNum>/<year>/<month>/<day>/<hourStart>-<hourEnd>``;
    the URL is printed before the request is made.

    Returns a dict with:
        'groups'   -- lists of message ids connected by replies, in the
                      order the groups were first created
        'sorted'   -- the same lists, longest first
        'messages' -- the parser's message-id -> message-dict mapping
    """
    url = "http://chat.stackexchange.com/transcript/{}/{}/{}/{}/{}-{}".format(
        roomNum, year, month, day, hourStart, hourEnd)
    print(url)

    # Use the response as a context manager so the connection is closed
    # even if reading or decoding fails.
    with ur.urlopen(url) as response:
        text = response.read().decode('utf-8')

    p = parser(debug=debug)
    p.feed(text)

    groups = _groupReplies(p.messages)
    groups2 = sorted(groups, key=len, reverse=True)
    return {'groups': groups, 'sorted': groups2, 'messages': p.messages}


def _groupReplies(messages):
    """Return lists of message ids linked by their 'rid' reply references.

    Messages are visited in ascending id order.  A reply is appended to
    every existing group that already contains its target; when there is
    none, a new group ``[target id, reply id]`` is started.
    """
    groups = []
    for mid in sorted(messages):
        info = messages[mid]
        if "rid" not in info:
            continue
        rid = info["rid"]
        found = False
        for group in groups:
            if rid in group:
                group.append(mid)
                found = True
        if not found:
            groups.append([rid, mid])
    return groups
if __name__ == '__main__':
    # Script entry point: for the 27th of months 5, 6 and 7 of 2016 in room
    # 240, draw one pie chart per month showing how many messages fall into
    # each word-count bucket.
    ## stuff = parseConvos(1, 2016, 7, 28, 3, 4, 0)
    import matplotlib.pyplot as plt

    bucketLabels = ('0-14', '15-29', '30+')
    for figNum, month in enumerate(range(5, 8)):
        stuff = parseConvos(240, 2016, month, 27, 0, 23, 0)

        # Whitespace-separated word count of every message on the page.
        wordage = []
        for message in stuff['messages'].values():
            wordage.append(len(message['content'].split()))

        fig = plt.figure(figNum)
        sub = fig.add_subplot(111, aspect='equal')
        ## sub.hist(wordage, max(wordage))

        # Buckets are 15 words wide; everything from 30 up shares the last one.
        bins = [0] * len(bucketLabels)
        for words in wordage:
            bins[min(words // 15, 2)] += 1

        sub.pie(bins, labels=bucketLabels, autopct='%1.1f%%')
        ## fig.set_axes([0.1,0.1,0.8,0.8])

    # Display all figures together once every month has been plotted.
    plt.show()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment