# Source: GitHub Gist anonymous/753b15953816ee0ef6e0, created December 5, 2014 02:12.
# Gist description: "possible memory leak?"
# (GitHub page navigation text captured together with the file -- "Skip to content",
# "Instantly share code, notes, and snippets.", "Show Gist options", and the
# duplicated "Save anonymous/753b15953816ee0ef6e0 to your computer and use it in
# GitHub Desktop." link -- is not part of the script.)
#!/usr/local/bin/ruby
require 'net/http'
require 'open-uri'
require 'nokogiri'
require 'json'
require 'sequel'
# Core extension used by the scraper to cut a fragment out of raw page text.
class String
  # Returns the text enclosed by +marker1+ and the nearest +marker2+ that
  # follows it, or nil when no such pair exists in the receiver.
  #
  # Both markers are matched literally (they are regex-escaped), and the
  # enclosed text may span several lines.
  def string_between_markers(marker1, marker2)
    opening = Regexp.escape(marker1)
    closing = Regexp.escape(marker2)
    found = match(/#{opening}(.*?)#{closing}/m)
    found ? found[1] : nil
  end
end
require '/Users/elemenopy/Documents/WTBWTSRubyScripts/WebScrapeClass.rb'
# Performs an HTTP GET on +url+ and returns the response body as a String.
#
# NOTE: this is defined at the top level under the name `open`, so bare
# `open(...)` calls in this script resolve here rather than to Kernel#open.
def open(url)
  target = URI.parse(url)
  Net::HTTP.get(target)
end
# Reports whether a forum thread is already stored for a given last-post date.
#
# thread   - thread URL, compared against ThreadTable.ThreadURL.
# postdate - last-post date text, compared against ThreadTable.PostDate.
#
# Returns true when at least one matching ThreadTable row exists, otherwise
# false.
#
# The connection is opened with the block form of Sequel.connect. Unlike the
# previous connect / query / disconnect sequence, the block form closes the
# connection even when the query raises, and it does not leave the Database
# object registered in Sequel::DATABASES after the call, so repeated calls no
# longer accumulate one Database object each.
def iscurrentthread(thread, postdate)
  settings = {
    :adapter => 'mysql', :user => 'root', :host => 'localhost',
    :database => 'wtbwtspoe', :password => ''
  }
  Sequel.connect(settings) do |db|
    db[:ThreadTable].where(:ThreadURL => thread, :PostDate => postdate).count > 0
  end
end
# Walks the listing pages of one forum section and scrapes every listed thread
# whose (URL, last-post date) pair is not yet recorded.
#
# rootlevelstring - listing URL prefix; the page number is appended to it.
# region          - "Garena" selects the Garena host for thread links; any
#                   other value selects www.pathofexile.com.
# page_count      - number of listing pages to visit, starting at page 1.
#                   Defaults to 20, the range the previous hard-coded loop
#                   (i = 1 while i < 21) covered.
#
# Each row is now inspected with XPath expressions relative to that row. The
# previous version re-queried the whole document with an absolute
# "//tr[<index>]" built from the zero-based loop index; XPath positions are
# one-based and counted per parent element, so the row that was read was not
# necessarily the row being iterated, index 0 never matched anything, and the
# document was rescanned for every row.
def runrootlevelscrape(rootlevelstring, region, page_count = 20)
  # Kept byte-for-byte (including the trailing slash on the Garena host): the
  # resulting URL is what iscurrentthread and WebPage#scrapepage receive as
  # the thread key.
  host = region == "Garena" ? "http://web.poe.garena.com/" : "http://www.pathofexile.com"

  1.upto(page_count) do |page_number|
    threadurl = rootlevelstring + page_number.to_s
    puts threadurl
    doc = Nokogiri::HTML(open(threadurl))
    rows = doc.xpath("//table[@class='forumTable viewForumTable']").xpath(".//tr")

    rows.each do |row|
      postdate = row.xpath("./td[@class='last_post']/span[@class='post_date']").inner_html.strip
      # Rows without a last-post date are skipped.
      next if postdate == ""

      # As before, the last matching anchor wins and a missing anchor yields
      # the bare host.
      link = row.xpath("./td[@class='thread']/div[@class='thread_title']/div[@class='title']/a[@href]").last
      posturl = host + (link ? link['href'] : "")

      if iscurrentthread(posturl, postdate)
        puts "Skipping this thread we already have it"
      else
        puts "New Thread Detected Scraping Now"
        WebPage.new.scrapepage(posturl, rootlevelstring, page_number.to_s, region, postdate)
      end
    end
  end
end
# Forum listing URL prefixes to poll, each paired with the region string that
# is passed to runrootlevelscrape. Commented-out entries are sections that were
# already disabled in the script.
FORUM_SECTIONS = [
  ["http://www.pathofexile.com/forum/view-forum/standard-trading-shops/page/", "Original"],
  ["http://www.pathofexile.com/forum/view-forum/standard-trading-selling/page/", "Original"],
  ["http://www.pathofexile.com/forum/view-forum/hardcore-trading-shops/page/", "Original"],
  ["http://www.pathofexile.com/forum/view-forum/hardcore-trading-selling/page/", "Original"],
  # ["http://www.pathofexile.com/forum/view-forum/548/page/", "Original"],
  # ["http://www.pathofexile.com/forum/view-forum/550/page/", "Original"],
  # ["http://www.pathofexile.com/forum/view-forum/549/page/", "Original"],
  # ["http://www.pathofexile.com/forum/view-forum/551/page/", "Original"],
  # ["http://www.pathofexile.com/forum/view-forum/552/page/", "Original"],
  # ["http://www.pathofexile.com/forum/view-forum/553/page/", "Original"],
  ["http://web.poe.garena.com/forum/view-forum/standard-shops/page/", "Garena"],
  ["http://web.poe.garena.com/forum/view-forum/standard-sell/page/", "Garena"],
  ["http://web.poe.garena.com/forum/view-forum/hardcore-shops/page/", "Garena"],
  ["http://web.poe.garena.com/forum/view-forum/hardcore-sell/page/", "Garena"],
  # ["http://web.poe.garena.com/forum/view-forum/beyond-selling/page/", "Garena"],
  # ["http://web.poe.garena.com/forum/view-forum/beyond-shops/page/", "Garena"],
  # ["http://web.poe.garena.com/forum/view-forum/rampage-shops/page/", "Garena"],
  # ["http://web.poe.garena.com/forum/view-forum/rampage-sell/page/", "Garena"],
].freeze

# Poll forever: start one scraping thread per enabled section, wait for all of
# them to finish, then start the next round.
loop do
  workers = FORUM_SECTIONS.map do |(prefix, section_region)|
    # Arguments are handed to the thread explicitly so each thread works on
    # its own pair of values.
    Thread.new(prefix, section_region) do |rootlevelstring, region|
      runrootlevelscrape(rootlevelstring, region)
    end
  end

  # join returns only once the thread has finished, so the Thread.kill calls
  # that used to follow the joins had nothing left to stop and were dropped.
  workers.each(&:join)
end
# Scrapes a single forum thread page and stores the items embedded in it in
# the wtbwtspoe MySQL database (ThreadTable, ItemTable, ExplicitModTable).
class WebPage
  # Connection settings handed to Sequel.connect.
  DB_SETTINGS = {
    :adapter => 'mysql', :user => 'root', :host => 'localhost',
    :database => 'wtbwtspoe', :password => ''
  }.freeze

  # Literal text surrounding the item JSON inside the page source.
  ITEM_JSON_PREFIX = "function(R) { (new R("
  ITEM_JSON_SUFFIX = ")).run(); });"

  def initialize()
  end

  # Nokogiri's document class is reopened here, as before, to add a clean-up
  # helper. Nothing in this class calls it.
  class Nokogiri::XML::Document
    # Collapses every run of blank lines inside each text node to a single
    # newline. Returns the document itself.
    def remove_empty_lines!
      xpath("//text()").each do |text|
        text.content = text.content.gsub(/\n(\s*\n)+/, "\n")
      end
      self
    end
  end

  # Performs an HTTP GET on +url+ and returns the response body.
  def open(url)
    Net::HTTP.get(URI.parse(url))
  end

  # Downloads one thread page and replaces the stored data for that thread.
  #
  # threadurl       - URL of the thread; also the key under which rows are
  #                   deleted and inserted.
  # rootlevelstring - listing URL prefix the thread was found under (only
  #                   used in the progress message).
  # pagenumber      - listing page number (only used in the progress message).
  # region          - stored with every item; "Garena" additionally prefixes
  #                   the icon URL with the webcdn host.
  # postdate        - last-post date text stored in ThreadTable.
  #
  # Nothing is written when the page has no post content block or no embedded
  # item JSON. The return value is not meaningful.
  #
  # Behaviour fixed relative to the previous implementation:
  # * The database connection is opened with the block form of
  #   Sequel.connect, so it is closed exactly once, also when the item list is
  #   empty or an insert raises, and the Database object is not left
  #   registered in Sequel::DATABASES. Previously `disconnect` ran inside the
  #   item loop, i.e. after every item and never for an empty list.
  # * Newlines are actually removed from the buyout text (the result of the
  #   former `gsub` call was discarded).
  # * With no twitch link on the page an empty string is stored and no
  #   "added twitch user" message is printed (the former `!= nil` test on a
  #   node set was always true, so nil was stored).
  # * A missing buyout node or a missing icon no longer raises.
  def scrapepage(threadurl, rootlevelstring, pagenumber, region, postdate)
    puts "Running Root Level of: " + rootlevelstring + " on page number: " + pagenumber.to_s + " and forum URL of: " + threadurl
    # Kept from the previous version: these are no-ops when the libraries are
    # already loaded and make sure they are available when they are not.
    require 'net/http'
    require 'open-uri'
    require 'nokogiri'
    require 'json'
    require 'sequel'

    page_content = open(threadurl)
    doc = Nokogiri::HTML(page_content)
    firstpostonpage = doc.at_xpath('//table[@class="forumTable forumPostListTable"]/tr/td/div[@class="content"]')
    return if firstpostonpage.nil?

    # NOTE(review): the expressions below start with "//" and therefore match
    # across the whole document, not only inside the first post. They are kept
    # unchanged because the buyout list is paired with the item JSON by
    # position -- confirm before narrowing them.
    firstpostonpage.xpath('//br').each { |br| br.remove }
    twitchusername = extract_twitch_username(doc)
    # The node right after each item fragment is treated as its buyout note
    # (the <br> elements were removed above, so they cannot be that node).
    buyouts = firstpostonpage.xpath('//div[starts-with(@id, "item-frag")]').map { |fragment| fragment.next_sibling }

    middletext = page_content.string_between_markers(ITEM_JSON_PREFIX, ITEM_JSON_SUFFIX)
    return if middletext.nil?

    item_entries = JSON.parse(middletext)
    Sequel.connect(DB_SETTINGS) do |db|
      replace_thread(db, threadurl, postdate)
      item_entries.each_with_index do |entry, index|
        store_item(db, entry[1], buyout_text(buyouts[index]), threadurl, region, twitchusername)
      end
    end
  end

  private

  # Returns the twitch user name taken from the twitch links inside the
  # "posted-by" blocks, or "" when there is none.
  #
  # The href values of all matching links are concatenated. A result longer
  # than 50 characters is discarded -- presumably because it then contains
  # more than one link; confirm. The first 21 characters are cut off, which
  # matches the length of "http://www.twitch.tv/" (assumed prefix).
  def extract_twitch_username(doc)
    hrefs = doc.xpath('//div[@class="posted-by"]').xpath('//a[@class="twitch"]/@href').to_s
    return "" if hrefs.empty? || hrefs.length > 50

    puts "added twitch user"
    hrefs[21..-1].to_s
  end

  # Turns the node following an item fragment into the stored buyout text.
  # A node whose markup has a line starting with "<div class" is not a buyout
  # note and yields ""; a missing node yields "" as well.
  def buyout_text(node)
    text = node.to_s
    return "" if text =~ /^<div class/

    text.delete("\n")
  end

  # Removes the stored items and thread row for +threadurl+ and records the
  # thread again with its current post date.
  def replace_thread(db, threadurl, postdate)
    threads = db[:ThreadTable]
    db[:ItemTable].where(:ThreadURL => threadurl).delete
    threads.where(:ThreadURL => threadurl).delete
    threads.insert(:ThreadURL => threadurl, :PostDate => postdate)
  end

  # Inserts one item row plus one ExplicitModTable row per explicit mod.
  #
  # item - the item attributes parsed from the page JSON (a Hash keyed by
  #        strings such as "name", "typeLine", "explicitMods").
  def store_item(db, item, buyout, threadurl, region, twitchusername)
    typeline = item["typeLine"]
    itemname = item["name"]
    # Only an explicit false counts here; a missing "support" key does not.
    itemname = typeline if item["support"] == false
    itemname = typeline if item["descrText"].to_s =~ /^This is a Support Gem/

    imageurl = item["icon"]
    if region == "Garena" && imageurl
      imageurl = "http://webcdn.pathofexile.com" + imageurl
    end

    # NOTE(review): 'NA' is stored as Username. The previous code also read
    # the account name from //span[@class="profile-link post_by_account"] but
    # never used it -- confirm whether it should be stored instead.
    # NOTE(review): the "flavourtext" key is all lower-case, unlike the other
    # camelCase keys -- verify against the page JSON.
    item_id = db[:ItemTable].insert(
      :Username => 'NA', :ThreadURL => threadurl, :Verified => item["verified"],
      :IconURL => imageurl, :League => item["league"],
      :ItemName => itemname, :ItemType => typeline,
      :Identified => item["identified"], :Corrupted => item["corrupted"],
      :FlavourText => item["flavourtext"], :Sockets => '', :Properties => '',
      :Requirements => 1, :Buyout => buyout, :Region => region,
      :ImplicitMod => item["implicitMods"], :twitchusername => twitchusername
    )

    (item["explicitMods"] || []).each do |expmod|
      # Split at the first space: leading token is the value, rest the type.
      modvalue, modtype = expmod.split(" ", 2)
      db[:ExplicitModTable].insert(:ItemTableID => item_id, :ModValue => modvalue, :ModType => modtype)
    end
  end
end
# (GitHub page footer captured together with the file, not part of the script:
# "Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment")