Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
import urllib.request as ur
import html.parser as hp
from django.core.exceptions import ObjectDoesNotExist, MultipleObjectsReturned
import datetime
from transcriptAnalyzer.models import *
class parser(hp.HTMLParser):
    """Scrape chat messages out of a transcript HTML page.

    Feed the page text with ``feed()``. Afterwards ``messages`` maps each
    message id to a dict with the keys ``uid``, ``rid``, ``name``, ``stars``,
    ``onebox``, ``content`` and ``timestamp``, and ``names`` maps each user id
    to the display name read from the page.

    The class is a small state machine driven by ``self.state``:

    * ``""``          -- idle, looking for the next interesting tag
    * ``"need name"`` -- a monologue for an unknown user was opened
    * ``"get name"``  -- the next text node is the user's display name
    * ``"get time"``  -- the next text node is a timestamp
    * ``"get stars"`` -- the next text node is a star count
    * ``"content"``   -- inside a message body; markup is copied to ``content``
    * ``"onebox"``    -- inside a ``<div>`` nested in a message body
    """

    # Number of characters dropped from a monologue's class value to reach the
    # user id, i.e. len("monologue user-").
    _UID_OFFSET = 15
    # Number of characters dropped from a nested div's class value to get the
    # onebox type, i.e. len("onebox ob-").
    _ONEBOX_OFFSET = 10
    # Fixed amounts trimmed from the start / end of a message body.
    # NOTE(review): presumably the indentation whitespace the transcript markup
    # puts around the message text -- confirm against a live page.
    _LEADING_PAD = 22
    _TRAILING_PAD = 40

    def __init__(self, *args, **kwargs):
        """Create a parser.

        Pass ``debug=<truthy>`` to print the tags and text being processed.
        Any other positional or keyword arguments are accepted but ignored.
        """
        super().__init__()
        # Counters that are initialised here but not updated by this class.
        self.numTags = 0
        self.numText = 0
        self.state = ""
        self.timestamp = None   # text of the most recent timestamp element
        self.divNest = 0        # depth of <div>s opened inside a onebox
        self.names = {}         # user id (int) -> display name
        self.messages = {}      # message id (int) -> message dict
        self.currUser = None    # user id of the monologue being read
        self.currMess = None    # message dict currently being filled in
        self.debug = kwargs.get('debug', 0)
        if self.debug:
            print("Debug on.")

    def handle_starttag(self, tag, attrs):
        """React to an opening tag according to the current state."""
        if self.state == "content":
            if tag != "div":
                # Inline markup is kept, but without its attributes.
                self.currMess["content"] += "<%s>" % tag
            else:
                # A div inside the body starts a onebox; its type is whatever
                # follows 'onebox ob-' in the first attribute's value. A div
                # with no (or a valueless) first attribute gets an empty type
                # instead of raising.
                box_class = (attrs[0][1] or "") if attrs else ""
                self.currMess["onebox"] = box_class[self._ONEBOX_OFFSET:]
                self.state = "onebox"
        elif self.state == "onebox":
            if tag == "div":
                self.divNest += 1
            self.currMess["content"] += "<{0} {1}>".format(
                tag, ' '.join('%s="%s"' % attr for attr in attrs))
        elif tag in ("div", "a"):
            if self.debug:
                print("tag:", tag)
                print("attrs:", attrs)
            # Only elements whose FIRST attribute is ``class`` are of interest.
            if not attrs or attrs[0][0] != "class":
                return
            css = attrs[0][1] or ""
            if css.startswith("monologue"):
                uid = css[self._UID_OFFSET:]
                # Drop the " mine" marker as a suffix (str.rstrip would treat
                # it as a set of characters rather than a suffix).
                if uid.endswith(" mine"):
                    uid = uid[:-len(" mine")]
                self.currUser = int(uid) if uid else None
                # ``names`` is keyed by the integer id, so test the integer
                # (testing the string form would never find a cached name).
                if self.currUser not in self.names:
                    self.state = "need name"
            elif css == "message":
                # Second attribute value looks like '<prefix>-<message id>'.
                mid = int(attrs[1][1].split('-')[1])
                self.messages[mid] = {'uid': self.currUser,
                                      'rid': None,
                                      'name': self.names[self.currUser],
                                      'stars': 0,
                                      'onebox': "",
                                      'content': "",
                                      'timestamp': self.timestamp}
                self.currMess = self.messages[mid]
            elif css == "reply-info":
                # Second attribute value ends in '#<id of the message replied to>'.
                rid = int(attrs[1][1].split('#')[1])
                self.currMess["rid"] = rid
            elif css == "username" and self.state == "need name":
                self.state = "get name"
            elif css == "content":
                self.state = "content"
                self.divNest = 0
            elif css == "timestamp":
                self.state = "get time"
        elif tag == "span" and attrs and attrs[0] == ('class', 'times'):
            self.state = "get stars"

    def handle_endtag(self, tag):
        """React to a closing tag while inside a message body or onebox."""
        if self.state == "content":
            if tag == "div":
                if self.divNest == 0:
                    # The body's own div closed: leave content mode and trim
                    # the fixed-size tail.
                    self.state = ""
                    self.currMess["content"] = \
                        self.currMess["content"][:-self._TRAILING_PAD]
                else:
                    self.divNest -= 1
                    self.currMess["content"] += "</div>"
            else:
                self.currMess["content"] += "</%s>" % tag
        elif self.state == "onebox":
            if tag == "div":
                if self.divNest == 0:
                    # The onebox's outer div closed.
                    self.state = ""
                else:
                    self.divNest -= 1
                    self.currMess["content"] += "</div>"
            else:
                self.currMess["content"] += "</%s>" % tag

    def handle_data(self, data):
        """Consume a text node according to the current state."""
        if self.state == "content":
            if self.debug:
                print(" data:", data)
            if self.currMess["content"]:
                self.currMess["content"] += data
            else:
                # First text of the body: drop the fixed-size head.
                self.currMess["content"] = data[self._LEADING_PAD:]
        elif self.state == "onebox":
            self.currMess["content"] += data
        elif self.state == "get name":
            self.state = ""
            display_name = data.strip()
            if not self.currUser:
                # No id came from the monologue's class value; take it from
                # the name text after its first four characters.
                self.currUser = int(display_name[4:])
            self.names[self.currUser] = display_name
        elif self.state == "get time":
            self.state = ""
            self.timestamp = data.strip()
        elif self.state == "get stars":
            self.state = ""
            data = data.strip()
            self.currMess['stars'] = int(data) if data else 0
def parseConvos(roomNum=240, year=2016, month=3, day=23, hourStart=0, hourEnd=4, debug=0, log=0):
    """Download one slice of a chat transcript and store its messages.

    The transcript page for ``roomNum`` on ``year``/``month``/``day`` between
    ``hourStart`` and ``hourEnd`` is fetched and parsed with ``parser``. For
    every parsed message the matching ``User`` and ``Username`` rows are
    looked up or created. A ``Message`` that already exists is re-saved only
    when its content, name or star count differs; new messages are inserted
    together with a single ``bulk_create`` at the end.

    ``debug`` is a bit mask: bit 0 turns on the HTML parser's tracing, bit 1
    prints the URL and a line per message. ``log`` is accepted but unused.
    """
    url = "http://chat.stackexchange.com/transcript/{}/{}/{}/{}/{}-{}".format(
        roomNum, year, month, day, hourStart, hourEnd)
    if debug & 2:
        print(url)
    # Read inside ``with`` so the HTTP response is closed once it is consumed.
    with ur.urlopen(url) as response:
        text = response.read().decode('utf-8')
    p = parser(debug=debug & 1)
    p.feed(text)

    date = datetime.date(year, month, day)  # same for every message on the page
    users = {}          # uid -> User row, so each user is looked up once
    seenNames = set()   # (uid, name) pairs already ensured in the database
    messagesToCreate = []
    for messNum, (mid, parsed) in enumerate(p.messages.items()):
        if debug & 2:
            print("messNum, mid: %s, %s" % (messNum, mid))
        uid = parsed['uid']
        name = parsed['name']
        stars = parsed['stars']
        content = parsed['content']
        onebox = parsed['onebox']

        if uid not in users:
            users[uid], _ = User.objects.get_or_create(uid=uid)
        user = users[uid]
        if (uid, name) not in seenNames:
            Username.objects.get_or_create(user=user, name=name)
            seenNames.add((uid, name))

        # Keep the ``try`` around the lookup only, so an error raised while
        # saving an existing row is never mistaken for "row does not exist".
        try:
            record = Message.objects.get(mid=mid)
        except ObjectDoesNotExist:
            record = Message(mid=mid, user=user, room=roomNum, date=date,
                             time=_transcript_time(parsed['timestamp']))
            isNew = True
        else:
            isNew = False
            if (record.content == content and record.name == name
                    and record.stars == stars):
                continue  # nothing that is compared has changed

        record.rid = parsed['rid']
        record.name = name
        record.stars = stars
        record.content = content
        record.onebox = bool(onebox)
        record.oneboxType = onebox
        if isNew:
            messagesToCreate.append(record)
        else:
            record.save()
    Message.objects.bulk_create(messagesToCreate)


def _transcript_time(timestamp):
    """Convert a 12-hour ``"H:MM AM"`` / ``"H:MM PM"`` string to ``datetime.time``."""
    hourmin, half = timestamp.split(" ")
    hour, minute = hourmin.split(":")
    # 12 AM -> 0 and 12 PM -> 12: reduce modulo 12, then add 12 for PM.
    return datetime.time(int(hour) % 12 + 12 * (half == "PM"), int(minute))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.