Instantly share code, notes, and snippets.

Embed
What would you like to do?
require 'net/http'
require 'uri'
require 'json'
require 'cgi'
# Lock shared by the crawler threads; parse takes it before appending to @keywords.
@mutex = Mutex.new
# Runs a "search inside" request for one term against the Amazon endpoint and
# returns the decoded JSON response.
#
# Any failure (network error, non-JSON body, ...) is retried, but only up to
# max_attempts times; the previous unbounded self-recursion could loop until
# the stack overflowed when the service kept failing.
#
# @param query [String] the term to search for inside the book
# @param max_attempts [Integer] total number of tries before giving up
# @return [Hash] the parsed response, or an empty Hash when every attempt
#   failed (callers then simply see no "results" key)
def query(query, max_attempts: 5)
  uri = URI.parse("https://www.amazon.de/gp/search-inside/service-data")
  params = {
    method: "getSearchResults",
    asin: "0316310352", # product id
    buyingAsin: "0316310352", # product id
    query: query,
    pageSize: 20,
    pageNumber: 1
  }
  header = { "Content-Type" => "application/x-www-form-urlencoded" }

  attempts = 0
  begin
    attempts += 1
    # Build a fresh connection and request for every attempt.
    http = Net::HTTP.new(uri.host, uri.port)
    http.use_ssl = true
    request = Net::HTTP::Post.new(uri, header)
    request.set_form_data(params)
    JSON.parse(http.request(request).body)
  rescue StandardError => e
    retry if attempts < max_attempts
    warn "query(#{query.inspect}) gave up after #{attempts} attempts: #{e.class}: #{e.message}"
    {}
  end
end
# Fragments collected by earlier runs, keyed by page id. On a first run the
# file does not exist yet, so start from an empty collection instead of
# crashing with Errno::ENOENT.
@fragments = File.exist?('fragments.json') ? JSON.parse(File.read('fragments.json')) : {}
# Top-level crawler threads, joined at the end of the script.
@threads = []
# initial keywords to start the crawl with
@keywords = ["anduin", "sylvanas", "undercity", "stormwind", "azeroth", "azerite", "arathi"]
# Stores the text fragments of one search response and crawls onward: every
# word of a newly seen fragment that is not yet a known keyword is queried in
# its own thread and parsed recursively.
#
# Shared state (@keywords, @fragments and the fragments.json file) is only
# touched while holding @mutex. Network requests and recursion happen outside
# the lock.
#
# @param response [Hash] decoded search response; its "results" entry is
#   expected to be a list of rows with the page id at index 0 and the
#   HTML-marked-up text at index 2
# @param query [String] the term that produced this response; it is recorded
#   as a known keyword
# @return [void]
def parse(response, query)
  results = response["results"]
  @mutex.synchronize do
    @keywords << query unless @keywords.include?(query)
  end
  return if results.nil?

  results.each do |result|
    page_id = result[0]
    raw_text = result[2]
    next if raw_text.nil?

    # Strip markup and layout whitespace, then decode HTML entities.
    text = CGI.unescapeHTML(raw_text.gsub(/<\/?[^>]*>/, "").delete("\n\t"))

    # nil means "fragment already known": skip only this result, not the
    # remainder of the response.
    words_to_use = @mutex.synchronize do
      page_fragments = (@fragments[page_id] ||= [])
      if page_fragments.include?(text)
        nil
      else
        page_fragments.push(text)
        File.open("fragments.json", "w") do |f|
          sorted = @fragments.sort_by { |k, _v| Integer(k) }.to_h
          f.write(JSON.pretty_generate(sorted))
        end
        # Thread-local word list; drop blanks and repeats so each new word is
        # queried once per fragment.
        text.split(/\W+/).reject(&:empty?).uniq - @keywords
      end
    end
    next if words_to_use.nil?

    crawlers = words_to_use.map do |word|
      Thread.new { parse(query(word), word) }
    end
    crawlers.each(&:join)
  end
end
# Start one crawler thread per seed keyword. Iterate over a snapshot: the
# crawler threads append to @keywords while running, and looping over the live
# array would keep picking up those additions and spawn further top-level
# threads.
@keywords.dup.each do |keyword|
  @threads << Thread.new { parse(query(keyword), keyword) }
end
# Wait for every crawl to finish before the script exits.
@threads.each(&:join)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment