| import bs4 | |
| import datetime as dt | |
def parseDate(comment):
    """Return the timestamp of a LessWrong comment div, or None when hidden.

    Parameters
    ----------
    comment : bs4.Tag
        A ``div.comment`` element from a LessWrong thread page.

    Returns
    -------
    datetime.datetime or None
        The parsed comment date, or None for comments collapsed as
        "comment score below threshold" (those carry no date span).

    Raises
    ------
    IndexError, AttributeError
        Re-raised when the expected markup is missing for any reason
        other than a below-threshold collapsed comment.
    """
    entry = comment.findChildren("div", class_="entry", recursive=False)[0]
    meta = entry.findChildren("div", class_="comment-meta", recursive=False)[0]
    try:
        raw = meta.findChildren("span", class_="comment-date")[0].contents[0].strip()
    except (IndexError, AttributeError):
        # Narrowed from a bare except: only missing/odd markup is expected
        # here; never swallow KeyboardInterrupt/SystemExit.
        if "comment score below threshold" in meta.text:
            return None
        raise
    # Example site format: "02 March 2015 03:04:05PM"
    return dt.datetime.strptime(raw, "%d %B %Y %I:%M:%S%p")
def dropNone(ls):
    """Return a list of the items in *ls* that are not None.

    Uses an identity check (``is not None``) rather than equality, so
    falsy-but-valid values (0, empty strings) are kept and objects with
    unusual ``__eq__`` cannot be dropped by mistake.
    """
    return [x for x in ls if x is not None]
def pageDate(soup):
    """Extract the thread's creation time from the page header metadata.

    Looks up the first ``span.date`` inside the first ``div.meta`` and
    parses its text, e.g. "02 March 2015 03:04PM".
    """
    meta_div = soup.findAll("div", class_="meta")[0]
    date_span = meta_div.findAll("span", class_="date")[0]
    stamp = date_span.contents[0].strip()
    return dt.datetime.strptime(stamp, "%d %B %Y %I:%M%p")
def parsePageToplevel(soup):
    """Minutes elapsed between thread creation and each top-level comment.

    Only direct children of the comment table are considered
    (``recursive=False``), so replies are excluded. Comments whose date
    cannot be read (below-threshold) are dropped.
    """
    comments_div = soup.findAll("div", id="comments")[0]
    table = comments_div.findChildren("div", class_="sitetable", recursive=False)[0]
    top_comments = table.findChildren("div", class_="comment", recursive=False)
    created = pageDate(soup)
    dates = dropNone(map(parseDate, top_comments))
    return map(lambda d: (d - created).total_seconds() / 60, dates)
def parsePageTotal(soup):
    """Minutes elapsed between thread creation and every comment on the page.

    Unlike :func:`parsePageToplevel`, this walks the comment table
    recursively (``recursive=True``), so nested replies are included.
    Comments whose date cannot be read (below-threshold) are dropped.
    """
    comments_div = soup.findAll("div", id="comments")[0]
    table = comments_div.findChildren("div", class_="sitetable", recursive=False)[0]
    all_comments = table.findChildren("div", class_="comment", recursive=True)
    created = pageDate(soup)
    dates = dropNone(map(parseDate, all_comments))
    return map(lambda d: (d - created).total_seconds() / 60, dates)
| import matplotlib.pyplot as plt | |
| from pandas import Series | |
def plotSeries(timedeltas, multiplier=1, name=None):
    """Plot cumulative comment count against time-since-posting.

    Parameters
    ----------
    timedeltas : iterable of float
        Minute offsets of comments from thread creation. Any iterable is
        accepted; the original required a mutable list.
    multiplier : float
        Scale applied to each cumulative count (e.g. 1/n to average over
        n threads).
    name : str or None
        Legend label passed through to ``Series.plot``.

    Notes
    -----
    Uses ``sorted()`` to work on a copy — the original sorted the
    caller's list in place, a surprising side effect.
    """
    ordered = sorted(timedeltas)
    counts = [(i + 1) * multiplier for i in range(len(ordered))]
    s = Series(counts, ordered)
    s.plot(label=name)
| import mechanize | |
# Scrape the most recent LessWrong open threads and plot how quickly
# comments accumulate after each thread is created.
root = "http://lesswrong.com/r/discussion/tag/open_thread/"
browser = mechanize.Browser()
browser.open(root)
listing = bs4.BeautifulSoup(browser.response())
n = 20  # number of open-thread listings to sample
# Thread links live in <h2 itemprop="name"> headers on the listing page;
# the first child anchor holds the relative href.
headers = [h for h in listing.findAll("h2") if h.get("itemprop") == "name"][:n]
links = ["http://lesswrong.com/" + h.findChildren()[0]["href"] for h in headers]
pages = [mechanize.urlopen(url).read() for url in links]
soups = [bs4.BeautifulSoup(page) for page in pages]
totals = []
tops = []
# Explicit loop instead of map()-for-side-effect: the original relied on
# Python 2's eager map and silently does nothing under Python 3.
for soup in soups:
    totals.extend(parsePageTotal(soup))
    tops.extend(parsePageToplevel(soup))
# Bug fix: plt.figure() does not accept xlabel/ylabel keyword arguments
# (they are not Figure properties and would raise); set axis labels via
# the dedicated pyplot calls instead.
plt.figure()
plt.xlabel("Time since thread created (minutes)")
plt.ylabel("Number of comments")
plotSeries(tops, 1. / n, name="Top level comments")
plotSeries(totals, 1. / n, name="Total comments")
# The label= values set above are only shown once a legend is requested.
plt.legend()
plt.show()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
