Created
December 5, 2014 02:12
-
-
Save anonymous/753b15953816ee0ef6e0 to your computer and use it in GitHub Desktop.
possible memory leak?
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/local/bin/ruby | |
require 'net/http' | |
require 'open-uri' | |
require 'nokogiri' | |
require 'json' | |
require 'sequel' | |
# Core-class extension used by the scraper to cut a payload out of raw page text.
class String
  # Returns the text found between the first occurrence of +marker1+ and the
  # nearest following +marker2+, or nil when that pair does not occur.
  # Both markers are matched literally, and the captured text may span lines.
  def string_between_markers(marker1, marker2)
    opening = Regexp.escape(marker1)
    closing = Regexp.escape(marker2)
    found = match(/#{opening}(.*?)#{closing}/m)
    found && found[1]
  end
end
require '/Users/elemenopy/Documents/WTBWTSRubyScripts/WebScrapeClass.rb' | |
# Performs an HTTP GET on +url+ and hands back whatever Net::HTTP.get returns
# for it. Because this is defined at the top level of the script, bare
# `open(...)` calls in this file resolve to this method.
def open(url)
  target = URI.parse(url)
  Net::HTTP.get(target)
end
# Reports whether ThreadTable already holds a row for this thread URL with
# this last-post date.
#
# thread   - thread URL, compared against the ThreadURL column.
# postdate - last-post date string, compared against the PostDate column.
#
# Returns true when at least one matching row exists, false otherwise.
def iscurrentthread(thread, postdate)
  found = false
  # Block form of Sequel.connect: the connection is closed when the block
  # exits, including when the query raises, so repeated calls do not pile up
  # open Database objects.
  Sequel.connect(:adapter => 'mysql', :user => 'root', :host => 'localhost', :database => 'wtbwtspoe', :password => '') do |db|
    found = db[:ThreadTable].where(:ThreadURL => thread, :PostDate => postdate).count > 0
  end
  found
end
# Walks listing pages 1..20 of one forum and scrapes every listed thread that
# iscurrentthread does not already report as stored with the same post date.
#
# rootlevelstring - listing URL prefix; the page number is appended to it.
# region          - "Garena" makes thread links use the Garena host; any other
#                   value uses www.pathofexile.com. Passed through to
#                   WebPage#scrapepage unchanged.
def runrootlevelscrape(rootlevelstring, region)
  lastpage = 21 # exclusive bound: pages 1 through 20 are fetched
  # Row-relative paths. The previous absolute "//tr[i]" form used a 0-based
  # index as a 1-based XPath position, so the final row was never examined.
  linkpath = "./td[@class='thread']/div[@class='thread_title']/div[@class='title']/a[@href]"
  datepath = "./td[@class='last_post']/span[@class='post_date']"
  # NOTE(review): only the Garena prefix ends in "/"; if hrefs start with "/"
  # this yields a double slash. Kept as in the original, confirm if intended.
  urlprefix = region == "Garena" ? "http://web.poe.garena.com/" : "http://www.pathofexile.com"

  (1...lastpage).each do |i|
    threadurl = rootlevelstring + i.to_s
    puts threadurl
    doc = Nokogiri::HTML(open(threadurl))
    rows = doc.xpath("//table[@class='forumTable viewForumTable']//tr")
    rows.each do |trrow|
      postdate = trrow.xpath(datepath).inner_html.strip
      # Rows without a post date are not checked or scraped.
      next if postdate == ""

      # When a row holds several matching anchors the last one wins, as before.
      link = trrow.xpath(linkpath).last
      posturl = urlprefix + (link ? link['href'] : "")

      if iscurrentthread(posturl, postdate)
        puts "Skipping this thread we already have it"
      else
        puts "New Thread Detected Scraping Now"
        WebPage.new.scrapepage(posturl, rootlevelstring, i.to_s, region, postdate)
      end
    end
  end
end
# Listing roots to poll, each paired with the region handed to
# runrootlevelscrape. Entries that were disabled in the original stay
# commented out.
forums = [
  ["http://www.pathofexile.com/forum/view-forum/standard-trading-shops/page/", "Original"],
  ["http://www.pathofexile.com/forum/view-forum/standard-trading-selling/page/", "Original"],
  ["http://www.pathofexile.com/forum/view-forum/hardcore-trading-shops/page/", "Original"],
  ["http://www.pathofexile.com/forum/view-forum/hardcore-trading-selling/page/", "Original"],
  # ["http://www.pathofexile.com/forum/view-forum/548/page/", "Original"],
  # ["http://www.pathofexile.com/forum/view-forum/550/page/", "Original"],
  # ["http://www.pathofexile.com/forum/view-forum/549/page/", "Original"],
  # ["http://www.pathofexile.com/forum/view-forum/551/page/", "Original"],
  # ["http://www.pathofexile.com/forum/view-forum/552/page/", "Original"],
  # ["http://www.pathofexile.com/forum/view-forum/553/page/", "Original"],
  ["http://web.poe.garena.com/forum/view-forum/standard-shops/page/", "Garena"],
  ["http://web.poe.garena.com/forum/view-forum/standard-sell/page/", "Garena"],
  ["http://web.poe.garena.com/forum/view-forum/hardcore-shops/page/", "Garena"],
  ["http://web.poe.garena.com/forum/view-forum/hardcore-sell/page/", "Garena"]
  # ["http://web.poe.garena.com/forum/view-forum/beyond-selling/page/", "Garena"],
  # ["http://web.poe.garena.com/forum/view-forum/beyond-shops/page/", "Garena"],
  # ["http://web.poe.garena.com/forum/view-forum/rampage-shops/page/", "Garena"],
  # ["http://web.poe.garena.com/forum/view-forum/rampage-sell/page/", "Garena"]
]

# Repeats forever: one thread per forum, then wait for all of them before the
# next pass. The threads have finished once join returns, so no kill is needed.
loop do
  workers = forums.map do |rooturl, region|
    # Arguments go through Thread.new so each thread holds its own pair.
    Thread.new(rooturl, region) { |url, reg| runrootlevelscrape(url, reg) }
  end
  workers.each(&:join)
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Loaded up front rather than inside scrapepage: the class body below reopens
# Nokogiri::XML::Document, which needs Nokogiri defined at load time.
require 'net/http'
require 'open-uri'
require 'nokogiri'
require 'json'
require 'sequel'

# Scrapes one forum thread page and stores its items in the database.
class WebPage
  def initialize
  end

  class Nokogiri::XML::Document
    # Collapses each run of blank lines inside every text node to a single
    # newline, in place. Returns the document.
    def remove_empty_lines!
      xpath("//text()").each { |text| text.content = text.content.gsub(/\n(\s*\n)+/, "\n") }
      self
    end
  end

  # Performs an HTTP GET on +url+ and returns what Net::HTTP.get gives back.
  def open(url)
    Net::HTTP.get(URI.parse(url))
  end

  # Fetches +threadurl+, extracts the item payload embedded in the page script
  # and replaces the stored rows for that thread with the freshly parsed ones.
  #
  # threadurl       - URL of the thread page to fetch.
  # rootlevelstring - listing URL this thread was found under (logged only).
  # pagenumber      - listing page number (logged only).
  # region          - stored with each item; "Garena" also prefixes icon URLs.
  # postdate        - stored in ThreadTable alongside the thread URL.
  #
  # Returns the parsed item data once it has been stored, or nil when the page
  # has no first-post content block or no embedded item payload.
  def scrapepage(threadurl, rootlevelstring, pagenumber, region, postdate)
    puts "Running Root Level of: " + rootlevelstring + " on page number: " + pagenumber.to_s + " and forum URL of: " + threadurl
    page_content = open(threadurl)
    doc = Nokogiri::HTML(page_content)
    firstpostonpage = doc.xpath('//table[@class="forumTable forumPostListTable"]/tr/td/div[@class="content"]').first
    return if firstpostonpage.nil?

    # The <br> nodes are dropped before the sibling lookups below, as in the
    # original statement order. The path is absolute, so this covers the whole
    # document and not only the first post.
    firstpostonpage.xpath('//br').each { |br| br.remove }
    twitchusername = twitch_username(doc)
    # One entry per item-frag div: the node that directly follows it. These are
    # paired with the JSON items by position.
    buyouts = firstpostonpage.xpath('//div[starts-with(@id, "item-frag")]').map { |frag| frag.next }

    middletext = page_content.string_between_markers("function(R) { (new R(", ")).run(); });")
    return if middletext.nil?
    jsonvariable_hash = JSON.parse(middletext)

    # Block form closes the connection once, after all items are written, and
    # also when an insert raises. Previously disconnect ran after every item
    # and never ran for an empty item list.
    Sequel.connect(:adapter => 'mysql', :user => 'root', :host => 'localhost', :database => 'wtbwtspoe', :password => '') do |db|
      threadcollection = db[:ThreadTable]
      db[:ItemTable].where(:ThreadURL => threadurl).delete
      threadcollection.where(:ThreadURL => threadurl).delete
      threadcollection.insert(:ThreadURL => threadurl, :PostDate => postdate)
      jsonvariable_hash.each_with_index do |child, index|
        # buyouts[index] is nil when there are fewer item-frag divs than items;
        # buyout_text turns that into an empty buyout instead of raising.
        store_item(db, child[1], buyout_text(buyouts[index]), threadurl, region, twitchusername)
      end
    end
    jsonvariable_hash
  end

  private

  # Builds the twitch user name from the twitch link hrefs found in +doc+.
  # All matching hrefs are concatenated; an empty result or one longer than 50
  # characters yields "". Otherwise the first 21 characters are cut off
  # (presumably the URL prefix before the user name; confirm).
  def twitch_username(doc)
    twitchfull = doc.xpath('//a[@class="twitch"]/@href').to_s
    return "" if twitchfull.empty? || twitchfull.length > 50
    puts "added twitch user"
    twitchfull[21..-1].to_s
  end

  # Renders the node following an item as buyout text with newlines removed.
  # A node whose markup starts with "<div class" is not a buyout note and
  # gives "".
  def buyout_text(node)
    text = node.to_s.delete("\n")
    text.start_with?("<div class") ? "" : text
  end

  # Inserts one item row plus one ExplicitModTable row per explicit mod.
  # +item+ is the attribute hash of a single parsed item.
  def store_item(db, item, buyout, threadurl, region, twitchusername)
    typeline = item["typeLine"]
    itemname = item["name"]
    # Items flagged support == false, and support gems, are stored under their
    # type line instead of their name.
    itemname = typeline if item["support"] == false
    itemname = typeline if item["descrText"].to_s =~ /^This is a Support Gem/
    imageurl = item["icon"]
    imageurl = "http://webcdn.pathofexile.com" + imageurl if region == "Garena"

    returnedid = db[:ItemTable].insert(:Username => 'NA', :ThreadURL => threadurl, :Verified => item["verified"], :IconURL => imageurl, :League => item["league"],
                                       :ItemName => itemname, :ItemType => typeline, :Identified => item["identified"], :Corrupted => item["corrupted"],
                                       :FlavourText => item["flavourtext"], :Sockets => '', :Properties => '', :Requirements => 1, :Buyout => buyout, :Region => region, :ImplicitMod => item["implicitMods"], :twitchusername => twitchusername)

    explicitmods = db[:ExplicitModTable]
    (item["explicitMods"] || []).each do |expmod|
      # Split at the first space: leading token is the value, the rest the type.
      modvalue, modtype = expmod.split(" ", 2)
      explicitmods.insert(:ItemTableID => returnedid, :ModValue => modvalue, :ModType => modtype)
    end
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment