Created
July 28, 2016 18:01
-
-
Save elendiastarman/faf00b9c43c0c937d314056360e08cdf to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import urllib.request as ur | |
import html.parser as hp | |
class parser(hp.HTMLParser):
    """Collect user names and messages from a chat transcript HTML page.

    The parser is a small state machine driven by the ``class`` attribute of
    ``<div>`` and ``<a>`` tags.  ``self.state`` is one of:

    * ``""``          -- idle, looking for interesting tags
    * ``"need name"`` -- a block for a not-yet-seen user was opened
    * ``"get name"``  -- inside the "username" element, waiting for its text
    * ``"content"``   -- inside a message's "content" element

    After ``feed()`` the results are available as:

    * ``names``    -- dict mapping user id -> user name
    * ``messages`` -- dict mapping message id -> dict with keys ``uid``,
      ``name``, ``content`` and, for replies, ``rid`` (id of the message
      replied to)
    * ``numTags`` / ``numText`` -- counters of examined div/a tags and of
      text chunks seen inside message content

    Keyword arguments:
        debug: truthy to print every examined tag and content text chunk.
    """

    def __init__(self, *args, **kwargs):
        super().__init__()
        self.numTags = 0
        self.numText = 0
        self.state = ""
        self.names = {}
        self.messages = {}
        # Initialised up front so that out-of-order markup cannot trigger an
        # AttributeError on these attributes.
        self.currUser = None
        self.currMess = None
        self.debug = kwargs.get('debug', 0)
        if self.debug: print("Debug on.")

    def handle_starttag(self, tag, attrs):
        if self.state == "content":
            message = self.messages[self.currMess]
            if tag != "div":
                # Inline markup is kept as a bare tag; attributes are dropped.
                message["content"] += "<%s>" % tag
            else:
                # A nested <div> replaces the whole content with a placeholder
                # built from its first attribute value and ends content
                # capture.  Fall back to the tag name if it has no attributes.
                label = attrs[0][1] if attrs else tag
                message["content"] = ("<%s>" % label).replace(' ', '_')
                self.state = ""
            return

        if tag not in ("div", "a"):
            return
        self.numTags += 1
        if self.debug:
            print("tag:", tag)
            print("attrs:", attrs)

        # Only elements whose FIRST attribute is "class" are dispatched on.
        if len(attrs) == 0 or attrs[0][0] != "class": return
        cls = attrs[0][1] or ""

        if cls.startswith("monologue"):
            # The user id starts at offset 15 (presumably the class looks like
            # "monologue user-<id>[ mine]" -- verify against the live markup).
            # Take the first space-delimited token instead of
            # rstrip(" mine"), which strips a character set, not a suffix.
            uid = int(cls[15:].split(" ", 1)[0])
            self.currUser = uid
            if uid not in self.names:
                self.state = "need name"
        elif cls == "message":
            # Second attribute is expected to end in "-<message id>".
            mid = int(attrs[1][1].split('-')[1])
            self.messages[mid] = {'uid': self.currUser,
                                  'name': self.names.get(self.currUser, ""),
                                  'content': ""}
            self.currMess = mid
        elif cls == "reply-info" and self.currMess is not None:
            # Second attribute is expected to end in "#<replied-to id>".
            rid = int(attrs[1][1].split('#')[1])
            self.messages[self.currMess]["rid"] = rid
        elif cls == "username" and self.state == "need name":
            self.state = "get name"
        elif cls == "content" and self.currMess is not None:
            self.state = "content"

    def handle_endtag(self, tag):
        if self.state != "content":
            return
        message = self.messages[self.currMess]
        if tag == "div":
            self.state = ""
            # Drop the trailing layout whitespace.  The original removed a
            # fixed 40 characters, which truncates real text whenever the
            # page is indented differently.
            message["content"] = message["content"].rstrip()
        else:
            message["content"] += "</%s>" % tag

    def handle_data(self, data):
        if self.state == "content":
            if self.debug: print(" data:", data)
            self.numText += 1
            message = self.messages[self.currMess]
            if message["content"]:
                message["content"] += data
            else:
                # First chunk: drop the leading layout whitespace (the
                # original skipped a fixed 22 characters).
                message["content"] = data.lstrip()
        elif self.state == "get name":
            name = data.strip()
            # Skip whitespace-only chunks so that an empty string is never
            # recorded as the user's name.
            if name:
                self.state = ""
                self.names[self.currUser] = name
def _groupReplies(messages):
    """Chain reply messages into conversation groups.

    ``messages`` maps message id -> dict; entries carrying an ``"rid"`` key
    are replies to the message with that id.  Messages are visited in
    ascending id order.  A reply is appended to every existing group that
    already contains its ``rid``; if none does, a new group ``[rid, mid]``
    is started.  Returns the list of groups (lists of message ids).
    """
    groups = []
    for mid in sorted(messages):
        val = messages[mid]
        if "rid" not in val:
            continue
        rid = val["rid"]
        found = False
        for g in groups:
            if rid in g:
                g.append(mid)
                found = True
        if not found:
            groups.append([rid, mid])
    return groups


def parseConvos(roomNum=240, year=2016, month=3, day=23, hourStart=0, hourEnd=4, debug=0):
    """Download one transcript page and group its messages into reply chains.

    The URL is built from the room number, the date and the hour range,
    printed, fetched and decoded as UTF-8, then fed to ``parser``.

    Args:
        roomNum: chat room number.
        year, month, day: date of the transcript page.
        hourStart, hourEnd: hour range placed in the URL as "start-end".
        debug: forwarded to ``parser`` to enable its debug printing.

    Returns:
        dict with keys
        ``'groups'``   -- reply chains in discovery order,
        ``'sorted'``   -- the same chains, longest first,
        ``'messages'`` -- the parser's message-id -> message dict.

    Raises:
        Whatever ``urllib.request.urlopen`` raises on network/HTTP failure.
    """
    urlTemp = "http://chat.stackexchange.com/transcript/"+"{}/"*4+"{}-{}"
    url = urlTemp.format(roomNum, year, month, day, hourStart, hourEnd)
    print(url)

    # Use the response as a context manager so the connection is always
    # closed, even if reading or decoding fails.
    with ur.urlopen(url) as response:
        text = response.read().decode('utf-8')

    p = parser(debug=debug)
    p.feed(text)

    groups = _groupReplies(p.messages)
    groups2 = sorted(groups, key=len, reverse=True)

    return {'groups':groups, 'sorted':groups2, 'messages':p.messages}
if __name__ == '__main__':
    # Plot, for three consecutive months, how long the messages posted on the
    # 27th were: one pie chart per month, bucketed by word count.
    import matplotlib.pyplot as plt

    bucketLabels = ('0-14','15-29','30+')

    for offset in range(3):
        result = parseConvos(240, 2016, 5+offset, 27, 0, 23, 0)

        # counts[0]: fewer than 15 words, counts[1]: 15-29, counts[2]: 30+
        counts = [0, 0, 0]
        for message in result['messages'].values():
            words = len(message['content'].split())
            if words >= 30:
                slot = 2
            elif words >= 15:
                slot = 1
            else:
                slot = 0
            counts[slot] += 1

        figure = plt.figure(offset)
        axes = figure.add_subplot(111, aspect='equal')
        axes.pie(counts, labels=bucketLabels, autopct='%1.1f%%')

    plt.show()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment