require 'net/http'
require 'uri'
require 'json'
require 'cgi'

# Single lock guarding the state shared by the crawler threads
# (@keywords, @fragments and the fragments.json file).
@mutex = Mutex.new
# Runs a "search inside the book" request against amazon.de and returns the
# decoded JSON response.
#
# Any failure (network error, unparsable body, ...) is retried, but only a
# bounded number of times: the previous unbounded recursive retry would never
# return while the service stayed unavailable.
#
# @param query [String] the search term
# @param asin [String] product id of the book to search in
# @param max_retries [Integer] additional attempts made after the first failure
# @return [Object] the parsed JSON body, or an empty Hash when every attempt
#   failed (an empty Hash has no "results" entry)
def query(query, asin: '0316310352', max_retries: 3)
  uri = URI.parse('https://www.amazon.de/gp/search-inside/service-data')
  attempts = 0

  begin
    http = Net::HTTP.new(uri.host, uri.port)
    http.use_ssl = true

    request = Net::HTTP::Post.new(uri)
    # set_form_data also sets the application/x-www-form-urlencoded
    # Content-Type, so no explicit header is needed.
    request.set_form_data(
      method: 'getSearchResults',
      asin: asin,
      buyingAsin: asin,
      query: query,
      pageSize: 20,
      pageNumber: 1
    )

    JSON.parse(http.request(request).body)
  rescue StandardError => e
    attempts += 1
    retry if attempts <= max_retries

    warn "query(#{query.inspect}) gave up after #{attempts} attempts: #{e.class}: #{e.message}"
    {}
  end
end
# Fragments collected so far, keyed by page id. A missing fragments.json
# (e.g. on the very first run) starts the crawl with an empty store instead of
# aborting with Errno::ENOENT.
fragments_file = File.exist?('fragments.json') ? File.read('fragments.json') : '{}'
@fragments = JSON.parse(fragments_file)
@threads = []
# initial keywords to start the crawl with
@keywords = %w[anduin sylvanas undercity stormwind azeroth azerite arathi]
# Records the text fragments returned for +query+ and recursively crawls every
# word in them that has not been used as a search term yet.
#
# For each new fragment one thread per unseen word is started; each thread
# queries that word and parses the answer, and all of them are joined before
# the next result row is handled.
#
# @param response [Hash] parsed search response; its "results" entry is read as
#   a list of rows whose element 0 is the page id and element 2 the fragment
#   text (possibly containing HTML markup)
# @param query [String] the search term that produced +response+
# @return [void]
def parse(response, query)
  @mutex.synchronize { @keywords << query unless @keywords.include?(query) }

  results = response['results']
  return if results.nil?

  results.each do |result|
    raw_text = result[2]
    next if raw_text.nil?

    text = clean_fragment(raw_text)
    # An already stored fragment only skips this row; the remaining rows are
    # still processed.
    # Keys loaded from fragments.json are always strings, so the id is
    # normalised to a string before the lookup.
    next unless store_fragment(result[0].to_s, text)

    # `query(word)` is the top-level query method; the `query` parameter is
    # only the search term.
    claim_new_words(text)
      .map { |word| Thread.new { parse(query(word), word) } }
      .each(&:join)
  end
end

# Strips HTML tags, newlines and tabs from +raw+ and decodes HTML entities.
# Works on a copy, so the response data is left untouched.
def clean_fragment(raw)
  CGI.unescapeHTML(raw.gsub(%r{</?[^>]*>}, '').delete("\n\t"))
end

# Adds +text+ to the fragments of +page_id+ and rewrites fragments.json sorted
# by numeric page id. Returns false, without writing, when the fragment is
# already stored for that page.
def store_fragment(page_id, text)
  @mutex.synchronize do
    fragments = (@fragments[page_id] ||= [])
    next false if fragments.include?(text)

    fragments << text
    sorted = @fragments.sort_by { |id, _| Integer(id) }.to_h
    File.write('fragments.json', JSON.pretty_generate(sorted))
    true
  end
end

# Returns the words of +text+ that are not keywords yet and registers them as
# keywords in the same critical section, so that two threads never pick up the
# same word.
def claim_new_words(text)
  candidates = text.split(/\W+/).reject(&:empty?).uniq
  @mutex.synchronize do
    fresh = candidates - @keywords
    @keywords.concat(fresh)
    fresh
  end
end
# Start one crawl thread per seed keyword, then wait for all of them.
# The loop walks a snapshot of @keywords: parse appends to that array from the
# worker threads, and Array#each would otherwise also visit entries added
# while the loop is still running.
@keywords.dup.each do |keyword|
  @threads << Thread.new do
    parse(query(keyword), keyword)
  end
end
@threads.each(&:join)
# --- GitHub page footer captured together with the script; not part of the code ---
# Sign up for free
# to join this conversation on GitHub.
# Already have an account?
# Sign in to comment