Skip to content

Instantly share code, notes, and snippets.

@johnnysparks
Last active Dec 16, 2015
Embed
What would you like to do?
Steam Profile Name Scraper
require 'open-uri'
require 'nokogiri'
require 'cgi'
# UserSeeder class has one important method -> get_user
# all forum and discussion pagination should be automagic
class UserSeeder
  # Matches the JSON-escaped discussion links embedded in a forum page
  # response and captures the topic id.
  TOPIC_ID_PATTERN = /steamcommunity.com\\\/discussions\\\/forum\\\/[0-9]+\\\/([0-9]+)\\\//

  # Matches the total discussion count reported in a forum page response.
  TOTAL_COUNT_PATTERN = /"total_count"\s?:\s?(null|[0-9]+)/i

  def initialize
    @forum = -1 # incremented before each new forum is fetched, so it starts at -1
    @user_buffer = [] # profile names waiting to be handed out by get_user
    @topic_buffer = [] # topic ids scraped from the current forum page
    @forum_page = 0 # index of the next page to fetch for the current forum
    @forum_pages = 0 # number of pages in the current forum
    @forum_page_size = 50 # max discussions per forum that Steam allows for a request
    @forum_contents = "" # unparsed contents of the most recently fetched forum page
    @page_scanned = true # true once @forum_contents has been scanned for topics
    @request_delay = 2 # seconds to pause after every HTTP request
    # ajax endpoint that returns json plus HTML for page results
    @forum_url = "http://steamcommunity.com/forum/4009259/General/render"
    # html endpoint for first page of discussion forum
    @discussion_url = "http://steamcommunity.com/discussions/forum/"
    # chrome user agent
    @user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_5) AppleWebKit/537.22 (KHTML, like Gecko) Chrome/25.0.1364.172 Safari/537.22"
  end

  # Walks forward through the small-integer forum numbers until it finds a
  # forum that reports at least one page of discussions.
  #
  # attempts - maximum number of forums to try (default 10). nil or a value
  #            <= 0 gives up immediately.
  #
  # Aborts the process with "cannot find a new forum" once the attempts are
  # used up. Errors raised by the HTTP request are not rescued.
  def next_forum(attempts = 10)
    # Count attempts explicitly: 0 is truthy in Ruby, so a plain truthiness
    # check on the counter would never trigger the give-up branch.
    attempts.to_i.times do
      @forum = @forum.next
      load_forum_page(0)
      @forum_pages = get_forum_page_count
      next unless @forum_pages > 0

      # Page 0 is already loaded, so the next page to request is page 1.
      @forum_page = 1
      return
    end
    abort("cannot find a new forum")
  end

  # Loads the next "page" of the current forum (really just a start offset and
  # a count). When the current forum has no pages left, moves to the next forum.
  def next_forum_page
    if @forum_page < @forum_pages
      load_forum_page(@forum_page * @forum_page_size)
      @forum_page = @forum_page.next
    else
      next_forum
    end
  end

  # Refills the topic buffer with the discussion ids found on a forum page.
  # Every fetched page is scanned exactly once; when a page yields no topics
  # the next page (or the next forum) is loaded and scanned instead.
  def fill_topic_buffer
    loop do
      # Never scan the same page twice, otherwise the same topics would be
      # handed out again and again and pagination would never advance.
      next_forum_page if @page_scanned
      page_topics = @forum_contents.scan(TOPIC_ID_PATTERN).flatten.uniq
      @page_scanned = true
      next if page_topics.empty?

      @topic_buffer.concat(page_topics)
      return
    end
  end

  # Returns the next profile name, refilling the user buffer from further
  # discussion topics for as long as it is empty.
  def get_user
    fill_user_buffer while @user_buffer.empty?
    @user_buffer.shift
  end

  # Fills the user buffer by scraping the next discussion topic for all
  # commenters. Refills the topic buffer first when it is empty.
  def fill_user_buffer
    fill_topic_buffer if @topic_buffer.empty?
    users_from_topic(@forum, @topic_buffer.shift)
  end

  # Opens a discussion thread and appends the profile names of the poster and
  # the commenters to the user buffer.
  #
  # forum - forum number the topic belongs to
  # topic - topic id within that forum
  def users_from_topic(forum, topic)
    # @discussion_url already ends in "/", so trim it to avoid "forum//<id>".
    html = http_get("#{@discussion_url.chomp('/')}/#{forum}/#{topic}/")
    doc = Nokogiri::HTML(html)
    doc.css('.commentthread_author_link, .forum_op_author').each do |link|
      # The profile name is the last path segment of the author link; links
      # without a usable href are skipped.
      profile_name = link['href'].to_s.split('/').last
      @user_buffer << profile_name if profile_name
    end
  end

  # Calculates the number of pages in the current forum by grabbing the total
  # number of discussions from the loaded page and dividing by the "page" size
  # (just the max results per request), rounded up.
  #
  # Returns 0 when no count is present or the count is null.
  def get_forum_page_count
    match = @forum_contents.match(TOTAL_COUNT_PATTERN)
    return 0 unless match

    # "null".to_i is 0, so a null count yields zero pages.
    (match[1].to_i / @forum_page_size.to_f).ceil
  end

  private

  # Fetches one page of the current forum starting at discussion offset
  # +start+ and stores it as the not-yet-scanned forum contents.
  def load_forum_page(start)
    @forum_contents = http_get("#{@forum_url}/#{@forum}/?start=#{start}&count=#{@forum_page_size}")
    @page_scanned = false
  end

  # Performs a GET request with the configured user agent and returns the
  # response body, then pauses for @request_delay seconds.
  def http_get(url)
    # URI.open is used because Kernel#open no longer opens URLs on Ruby 3.0+.
    body = URI.open(url, "User-Agent" => @user_agent, &:read)
    sleep(@request_delay)
    body
  end
end
#
# Executing script (main)
#
# Print scraped profile names one per line, forever. Output is flushed after
# every name so a downstream consumer sees each one as soon as it is found.
seeder = UserSeeder.new
loop do
  puts seeder.get_user
  $stdout.flush
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment