Skip to content

Instantly share code, notes, and snippets.

@TRManderson
Last active August 29, 2015 14:12
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save TRManderson/6849ab558d18906ede40 to your computer and use it in GitHub Desktop.
Save TRManderson/6849ab558d18906ede40 to your computer and use it in GitHub Desktop.
In a meta-post on LessWrong, Metus suggested that someone plot a curve of comments over time, so I did (see http://lesswrong.com/r/discussion/lw/lev/short_meta_should_open_threads_be_more_frequent/).
import bs4
import datetime as dt
def parseDate(comment):
    """Return the timestamp of one comment node, or None if it has no date.

    ``comment`` is expected to be a BeautifulSoup tag for a single comment.
    The date text is read from the first ``span.comment-date`` found under
    ``div.entry > div.comment-meta`` (both looked up as direct children) and
    parsed with the format ``"%d %B %Y %I:%M:%S%p"``.

    Returns:
        A ``datetime.datetime`` for the comment, or ``None`` when the date
        text cannot be read and the comment-meta text contains
        "comment score below threshold".

    Raises:
        IndexError: if the ``div.entry`` or ``div.comment-meta`` child is
            missing, or if the date is missing and the threshold notice is
            not present (the lookup error is re-raised unchanged).
        ValueError: if the date text does not match the expected format.
    """
    entry = comment.findChildren("div", class_="entry", recursive=False)[0]
    meta = entry.findChildren("div", class_="comment-meta", recursive=False)[0]
    try:
        date_span = meta.findChildren("span", class_="comment-date")[0]
        date_text = date_span.contents[0].strip()
    except (IndexError, AttributeError, TypeError):
        # Only failures caused by the shape of the markup count as "no date
        # here".  A bare ``except`` would also intercept KeyboardInterrupt and
        # SystemExit and could silently turn them into a ``None`` result.
        if "comment score below threshold" in meta.text:
            return None
        raise
    return dt.datetime.strptime(date_text, "%d %B %Y %I:%M:%S%p")
def dropNone(ls):
    """Return a new list with every ``None`` entry of ``ls`` removed.

    ``ls`` may be any iterable.  Only ``None`` itself is filtered out; other
    falsy values such as ``0``, ``""`` or ``False`` are kept, and the
    relative order of the remaining items is preserved.
    """
    # Identity check on purpose: ``x != None`` would call the element's own
    # ``__ne__``, which may be overridden to return something unexpected.
    return [x for x in ls if x is not None]
def pageDate(soup):
    """Return the date shown in the page's first ``div.meta`` block.

    The text of the first ``span.date`` inside the first ``div.meta`` of
    ``soup`` is stripped and parsed with the format ``"%d %B %Y %I:%M%p"``.

    Raises:
        IndexError: if no matching ``div.meta`` or ``span.date`` exists.
        ValueError: if the text does not match the expected format.
    """
    meta_divs = soup.findAll("div", class_="meta")
    date_spans = meta_divs[0].findAll("span", class_="date")
    stamp = date_spans[0].contents[0]
    return dt.datetime.strptime(stamp.strip(), "%d %B %Y %I:%M%p")
def parsePageToplevel(soup):
    """Return the age, in minutes, of each top-level comment on a page.

    Only ``div.comment`` elements that are direct children of the comment
    area (the ``div.sitetable`` directly under ``div#comments``) are used.
    Each comment's timestamp from ``parseDate`` is measured against the page
    date from ``pageDate``; comments for which ``parseDate`` returns ``None``
    are skipped.

    Returns:
        A list of floats: minutes between the page date and each comment.

    Raises:
        IndexError: if the comments container or its sitetable is missing.
    """
    comment_area = soup.findAll("div", id="comments")[0].findChildren(
        "div", class_="sitetable", recursive=False)[0]
    comments = comment_area.findChildren(
        "div", class_="comment", recursive=False)
    comment_dates = [parseDate(comment) for comment in comments]
    page_date = pageDate(soup)
    # Build a real list: ``map`` is a one-shot lazy iterator on Python 3, so
    # returning it would give callers a different type than on Python 2.
    return [
        (posted - page_date).total_seconds() / 60
        for posted in dropNone(comment_dates)
    ]
def parsePageTotal(soup):
    """Return the age, in minutes, of every comment on a page.

    All ``div.comment`` elements at any depth inside the comment area (the
    ``div.sitetable`` directly under ``div#comments``) are used.  Each
    comment's timestamp from ``parseDate`` is measured against the page date
    from ``pageDate``; comments for which ``parseDate`` returns ``None`` are
    skipped.

    Returns:
        A list of floats: minutes between the page date and each comment.

    Raises:
        IndexError: if the comments container or its sitetable is missing.
    """
    comment_area = soup.findAll("div", id="comments")[0].findChildren(
        "div", class_="sitetable", recursive=False)[0]
    comments = comment_area.findChildren(
        "div", class_="comment", recursive=True)
    comment_dates = [parseDate(comment) for comment in comments]
    page_date = pageDate(soup)
    # Build a real list: ``map`` is a one-shot lazy iterator on Python 3, so
    # returning it would give callers a different type than on Python 2.
    return [
        (posted - page_date).total_seconds() / 60
        for posted in dropNone(comment_dates)
    ]
import matplotlib.pyplot as plt
from pandas import Series
def plotSeries(timedeltas, multiplier=1, name=None):
    """Plot a cumulative-count curve for a collection of time offsets.

    The offsets are sorted in ascending order and used as the index of a
    pandas ``Series`` whose k-th value (counting from 1) is
    ``k * multiplier``.  The series is then drawn with ``Series.plot``.

    Args:
        timedeltas: iterable of sortable offsets; it is not modified.
        multiplier: factor applied to every running count.
        name: label passed to ``Series.plot``.
    """
    # ``sorted`` copies, so the caller's list keeps its order and any
    # iterable (tuple, generator, ...) is accepted, not just a list.
    ordered = sorted(timedeltas)
    counts = [rank * multiplier for rank in range(1, len(ordered) + 1)]
    Series(counts, index=ordered).plot(label=name)
import mechanize
# Listing page from which the thread links are collected.
root = "http://lesswrong.com/r/discussion/tag/open_thread/"
browser = mechanize.Browser()
browser.open(root)
listing = bs4.BeautifulSoup(browser.response())

# Maximum number of threads to sample, in the order the listing shows them.
n = 20
links = [x for x in listing.findAll("h2") if x.get("itemprop") == "name"][:n]
links = [x.findChildren()[0]["href"] for x in links]
links = ["http://lesswrong.com/" + x for x in links]
pages = [mechanize.urlopen(x).read() for x in links]
soups = [bs4.BeautifulSoup(x) for x in pages]

totals = []
tops = []
# Explicit loop: ``map`` used only for its side effects never executes on
# Python 3, which would leave both lists empty.
for soup in soups:
    totals.extend(parsePageTotal(soup))
    tops.extend(parsePageToplevel(soup))

plt.figure()
# NOTE(review): the counts are scaled by 1/n even when the listing yields
# fewer than n links -- confirm whether len(soups) is the intended divisor.
plotSeries(tops, 1. / n, name="Top level comments")
plotSeries(totals, 1. / n, name="Total comments")
# Axis labels are set through pyplot; ``plt.figure`` does not accept
# ``xlabel``/``ylabel`` keyword arguments.
plt.xlabel("Time since thread created (minutes)")
plt.ylabel("Number of comments")
# Without a legend the two series labels are never shown.
plt.legend()
plt.show()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment