Steam Profile Name Scraper
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'open-uri' | |
require 'nokogiri' | |
require 'cgi' | |
# UserSeeder walks the Steam community discussion forums and hands out
# profile names one at a time.
#
# The only method callers need is #get_user; forum, forum-page and topic
# pagination all happen automatically behind it.
class UserSeeder
  # Seconds to pause after every HTTP request.
  REQUEST_DELAY = 2

  # Captures the topic id from the discussion links embedded in the AJAX
  # forum response. Slashes in that response are JSON-escaped ("\/"), hence
  # the literal backslash in front of every slash.
  TOPIC_ID_PATTERN = %r{steamcommunity\.com\\/discussions\\/forum\\/[0-9]+\\/([0-9]+)\\/}.freeze

  # Captures the "total_count" field of the AJAX forum response.
  TOTAL_COUNT_PATTERN = /"total_count"\s?:\s?(null|[0-9]+)/i.freeze

  def initialize
    @forum = -1 # next_forum increments before fetching, so start at -1
    @user_buffer = [] # profile names waiting to be handed out
    @topic_buffer = [] # topic ids scraped from the current forum page
    @forum_page = 0 # index of the next page to fetch in the current forum
    @forum_pages = 0 # number of pages in the current forum
    @forum_page_size = 50 # max discussions per forum that Steam allows for a request
    @forum_contents = "" # unparsed forum page; emptied once its topics are taken
    # ajax endpoint that returns json plus HTML for page results
    @forum_url = "http://steamcommunity.com/forum/4009259/General/render"
    # html endpoint for first page of discussion forum
    # NOTE(review): the trailing slash here yields a double slash in the
    # topic URLs built by users_from_topic; kept as-is.
    @discussion_url = "http://steamcommunity.com/discussions/forum/"
    # chrome user agent
    @user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_5) AppleWebKit/537.22 (KHTML, like Gecko) Chrome/25.0.1364.172 Safari/537.22"
  end

  # Advances to the next forum number, repeating until a forum reports at
  # least one page of discussions.
  #
  # attempts - maximum number of forum numbers to try (default 10).
  #
  # Aborts the process with "cannot find a new forum" once the attempts are
  # used up. (The previous `unless attempts` test never fired because 0 is
  # truthy in Ruby, so the search never gave up.)
  def next_forum(attempts = 10)
    attempts.to_i.times do
      @forum = @forum.next
      @forum_contents = fetch(forum_page_url(0))
      @forum_pages = get_forum_page_count
      next unless @forum_pages > 0

      # Page 0 is already loaded in @forum_contents, so the next page to
      # request is index 1; leaving this at 0 fetched page 0 twice.
      @forum_page = 1
      return @forum
    end
    abort("cannot find a new forum")
  end

  # Loads the next "page" of the current forum (really just a start offset
  # and a count). When the current forum has no pages left, moves on to the
  # next forum instead.
  def next_forum_page
    return next_forum unless @forum_page < @forum_pages

    @forum_contents = fetch(forum_page_url(@forum_page * @forum_page_size))
    @forum_page = @forum_page.next
  end

  # Refills the topic buffer with the topic ids found in the loaded forum
  # page, fetching further pages until one yields at least one topic.
  def fill_topic_buffer
    loop do
      page_topics = @forum_contents.scan(TOPIC_ID_PATTERN).flatten
      # Mark the page as consumed. Without this the same page was scanned
      # again on every refill and the scraper never advanced.
      @forum_contents = ""
      unless page_topics.empty?
        @topic_buffer.concat(page_topics)
        return
      end
      next_forum_page
    end
  end

  # Returns the next profile name, refilling the user buffer from further
  # topics for as long as it is empty.
  def get_user
    fill_user_buffer while @user_buffer.empty?
    @user_buffer.shift
  end

  # Scrapes the next buffered topic for user names, refilling the topic
  # buffer first when it is empty.
  def fill_user_buffer
    fill_topic_buffer if @topic_buffer.empty?
    users_from_topic(@forum, @topic_buffer.shift)
  end

  # Opens a discussion thread and appends the profile name of the original
  # poster and of every commenter to the user buffer.
  #
  # forum - forum number the topic belongs to.
  # topic - topic id within that forum.
  def users_from_topic(forum, topic)
    doc = Nokogiri::HTML(fetch("#{@discussion_url}/#{forum}/#{topic}/"))
    doc.css('.commentthread_author_link, .forum_op_author').each do |link|
      href = link.attribute('href')
      next unless href # author element without a link: nothing to extract

      # the profile name is the last path segment of the profile URL
      name = href.content.split('/').last
      @user_buffer << name if name
    end
  end

  # Returns the number of pages in the current forum: the "total_count"
  # reported in the loaded forum response divided by the page size, rounded
  # up. Returns 0 when the response is too short to hold a count, has no
  # "total_count" field, or reports it as null.
  def get_forum_page_count
    return 0 unless @forum_contents.length > 10

    match = @forum_contents.match(TOTAL_COUNT_PATTERN)
    return 0 unless match

    # "null".to_i is 0, so a null count yields zero pages
    (match[1].to_i / @forum_page_size.to_f).ceil
  end

  private

  # Builds the AJAX URL for the slice of the current forum that begins at
  # discussion offset +start+.
  def forum_page_url(start)
    "#{@forum_url}/#{@forum}/?start=#{start}&count=#{@forum_page_size}"
  end

  # Downloads +url+ with the configured user agent, pauses REQUEST_DELAY
  # seconds, and returns the response body as a String.
  #
  # Uses URI.open because Kernel#open no longer accepts URLs on Ruby 3.0+;
  # the block form closes the response handle once it has been read.
  def fetch(url)
    body = URI.open(url, "User-Agent" => @user_agent, &:read)
    sleep(REQUEST_DELAY)
    body
  end
end
#
# Script entry point: print scraped profile names forever, one per line.
#
seeder = UserSeeder.new
loop do
  puts seeder.get_user
  # push each name out right away instead of waiting for the buffer to fill
  $stdout.flush
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment