|
require 'rubygems' |
|
require 'nokogiri' |
|
require 'open-uri' |
|
|
|
# Username whose comment history is scraped.
# Pass it as the first command-line argument, or change the default below.
# It must match the name in the 'lesswrong.com/user/USERNAME/comments/' url.
username = ARGV.first || "Will_Newsome"
|
|
|
# Crawls a user's comment listing starting at +url+, following every "Next"
# link, and records each comment's karma and permalink.
#
# Results are accumulated in globals that the caller must initialise first:
#   $karma - Hash with default 0, mapping a karma score to how many comments have it
#   $links - Array of [karma, permalink_href] pairs
#   $pages - number of pages fetched so far (printed before each fetch)
#   $last  - href of the most recent "Next" link that was followed
#
# Pages are walked in a loop rather than by one recursive call per page, so a
# long comment history cannot exhaust the call stack.
#
# @param url [String] address of the first comments page
# @return [nil]
def parse_page(url)
  while url
    puts $pages
    $pages += 1

    # URI.open is the explicit open-uri entry point; a bare Kernel#open no
    # longer fetches URLs on Ruby 3.0+. The block form closes the downloaded
    # stream as soon as the document has been parsed.
    page = URI.open(url) { |io| Nokogiri::HTML.parse(io) }

    # Each karma figure shows up twice in this node list (see the 2 * index
    # lookup below). Text without a number yields 0 via nil.to_i.
    karma_values = page.xpath('//span[@class="votes "]').children.map do |node|
      node.content[/-?\d+/].to_i
    end

    # One permalink href per comment.
    permalinks = page.xpath('//li[@class="permalink"]').children.map do |node|
      node["href"]
    end

    # Update the karma histogram and remember which link earned which score.
    permalinks.each_with_index do |href, index|
      score = karma_values[2 * index] # skip the duplicated karma entries
      $karma[score] += 1
      $links.push([score, href])
    end

    # Continue with the next page if there is one; a missing "Next" anchor
    # (or one without an href) ends the crawl.
    next_link = page.xpath('//a[text()="Next"]').first
    url = next_link && next_link["href"]
    $last = url if url
  end
end
|
|
|
# Shared accumulators that parse_page fills in while it crawls:
# karma histogram, [karma, permalink] pairs, pages fetched, last "Next" href.
$karma, $links, $pages, $last = Hash.new(0), [], 0, ""

# Start at the first page of the user's comments; parse_page follows the
# "Next" links from there.
first_page_url = "http://lesswrong.com/user/#{username}/comments/"
parse_page(first_page_url)

# Summary: the last "Next" link that was followed, then the page count.
puts $last, $pages
|
|
|
# Now that we have a karma hash, output it to karma.tsv: one
# "score<TAB>count" row per distinct score, in ascending score order.
# The block form of File.open closes the file even if sorting or writing raises.
File.open("karma.tsv", 'w') do |outfile|
  $karma.sort.each do |score, count|
    outfile.puts("#{score}\t#{count}")
  end
end
|
|
|
# Now that we have the list of [karma, permalink] pairs, output it to
# links.tsv: one "karma<TAB>permalink" row per comment, highest karma first.
# The block form of File.open closes the file even if sorting or writing raises.
File.open("links.tsv", 'w') do |outfile|
  # Sorting on href.to_s keeps the same order as a plain sort for well-formed
  # pairs, but does not raise when a permalink node had no href (nil).
  ranked = $links.sort_by { |score, href| [score, href.to_s] }
  ranked.reverse_each do |score, href|
    # Interpolation renders a nil href as an empty field instead of raising
    # TypeError the way String#+ would.
    outfile.puts("#{score}\t#{href}")
  end
end
|
|