Skip to content

Instantly share code, notes, and snippets.

@quartata
Created August 21, 2016 23:44
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save quartata/1300df4fd43c9cf0374e9548a2b6f209 to your computer and use it in GitHub Desktop.
Save quartata/1300df4fd43c9cf0374e9548a2b6f209 to your computer and use it in GitHub Desktop.
import datetime
import pickle
import transcriptAnalyzer_database
import urllib.request
class Message:
    """A piece of message text paired with the replies collected for it.

    Attributes:
        content: The message value given at construction time.
        replies: The replies object given at construction time; it is
            stored by reference, not copied.
    """

    def __init__(self, message, replies):
        self.content, self.replies = message, replies
# http://stackoverflow.com/a/1060376/4766556
def daterange(start, stop, step=datetime.timedelta(days=1), inclusive=False):
    """Yield values from ``start`` towards ``stop``, advancing by ``step``.

    Mirrors ``range``: ``stop`` is excluded unless ``inclusive`` is true and
    the walk lands exactly on it. The direction is taken from the sign of
    ``step.days``; a step of zero whole days performs no walk at all.

    Args:
        start: First value to yield.
        stop: Boundary value.
        step: Increment added on each iteration; must expose ``.days``.
        inclusive: When true, also yield ``stop`` if it is reached exactly.

    Yields:
        Successive values ``start``, ``start + step``, ...
    """
    current = start
    forward = step.days > 0
    backward = step.days < 0
    if forward or backward:
        while (current < stop) if forward else (current > stop):
            yield current
            # Rebind rather than augment so a mutable ``start`` passed in by
            # the caller is never modified in place.
            current = current + step
    if inclusive and current == stop:
        yield current
def bk_insert(message, tree):
    """Insert ``message`` into a BK-tree and return the tree's root node.

    A node is a two-item list ``[message, children]``. ``children[d - 1]``
    holds the subtree of messages whose ``distance`` to the node's message
    is ``d``, or ``None`` when no message at that distance has been stored.

    Args:
        message: Object with ``content`` and ``replies`` attributes.
        tree: Root node, or an empty/falsy value for an empty tree.

    Returns:
        The root node. A new node is created when ``tree`` is empty;
        otherwise ``tree`` itself is updated in place and returned.
    """
    if not tree:
        return [message, []]

    node, children = tree[0], tree[1]
    d = distance(message.content, node.content)
    if d == 0:
        # Same content as the node: merge the replies instead of adding a
        # duplicate entry.
        node.replies.extend(message.replies)
        return tree

    # Make sure slot d - 1 exists. The previous code tested ``len > d`` and
    # so, when the list held exactly d slots, overwrote an occupied slot and
    # silently dropped the whole subtree stored there.
    if len(children) < d:
        children.extend([None] * (d - len(children)))

    subtree = children[d - 1]
    if subtree is None:
        children[d - 1] = [message, []]
    else:
        children[d - 1] = bk_insert(message, subtree)
    return tree
def distance(a, b):
    """Return a position-wise distance between two sequences.

    Each index within the shared length where the items differ costs 2, and
    every item by which one sequence is longer than the other costs 1.
    """
    mismatches = sum(1 for left, right in zip(a, b) if left != right)
    return 2 * mismatches + abs(len(b) - len(a))
def parseConvos(roomNum=240, startDate=datetime.date(2016, 8, 21)):
    """Download a chat room's daily transcripts and index messages with replies.

    One transcript page is fetched for every day from ``startDate`` through
    today (inclusive) and fed to a ``transcriptAnalyzer_database.parser``.
    Each parsed message is then paired with the later messages treated as
    replies to it and inserted into a tree via ``bk_insert``.

    A later message counts as a reply when:
      * its ``"rid"`` equals this message's id, or
      * it has no ``"rid"`` and either
          - it falls within an hour of this message (see note below) and
            contains ``"@" + <this message's name>``, or
          - it is the very next message and contains no ``"@"`` at all.

    Args:
        roomNum: Room number placed in the transcript URL.
        startDate: First day to download.

    Returns:
        The tree built by ``bk_insert``, or ``[]`` when no messages were
        parsed.

    Raises:
        urllib.error.URLError: If a transcript page cannot be fetched.
        ValueError: If a message timestamp does not match ``"%I:%M %p"``.
    """
    parser = transcriptAnalyzer_database.parser()
    url_template = "http://chat.stackexchange.com/transcript/{}/{}/{}/{}/0-24"
    for date in daterange(startDate, datetime.date.today(), inclusive=True):
        print("Retrieving: " + str(date))
        url = url_template.format(roomNum, date.year, date.month, date.day)
        # Close each HTTP response as soon as it is read; one is opened per
        # day, so leaving them to the garbage collector leaks connections.
        with urllib.request.urlopen(url) as response:
            text = response.read().decode('utf-8')
        parser.feed(text)

    print("Assembling...")
    # presumably parser.messages maps message id -> dict with "timestamp",
    # "rid", "name" and "content" keys; verify against the parser module.
    messages = list(parser.messages.items())
    length = len(messages)

    # Parse every timestamp once up front; the pairing loop below is
    # quadratic and previously re-parsed the same strings on each pass.
    timestamps = [
        datetime.datetime.strptime(entry["timestamp"], "%I:%M %p")
        for _, entry in messages
    ]

    tree = []
    for i, (message_id, message) in enumerate(messages):
        replies = []
        for j in range(i + 1, length):
            next_message = messages[j][1]
            rid = next_message["rid"]
            content = next_message["content"]
            if rid == message_id:
                replies.append(content)
            elif rid is None:
                # The timestamps carry a time of day only, and
                # timedelta.seconds is always in [0, 86400), so this is
                # "less than an hour later on the clock" and also matches
                # messages from other days at a similar time.
                within_hour = (timestamps[j] - timestamps[i]).seconds // 3600 == 0
                if within_hour and "@" + message["name"] in content:
                    replies.append(content)
                # NOTE(review): this branch is assumed to pair with the
                # mention check above rather than with the rid checks;
                # confirm against the original layout.
                elif j == i + 1 and "@" not in content:
                    replies.append(content)
        tree = bk_insert(Message(message["content"], replies), tree)
    return tree
if __name__ == "__main__":
    # Build the tree before opening the output file, so a failed download
    # cannot leave a truncated knowledge base behind, and close the file
    # deterministically once the pickle has been written.
    knowledge_base = parseConvos()
    with open("knowledge_base.pck", "wb") as output_file:
        pickle.dump(knowledge_base, output_file)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment