Created
December 31, 2018 15:48
-
-
Save justMaku/7cd30695a8687e2aa471f1035a0339bf to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'net/http' | |
require 'uri' | |
require 'json' | |
require 'cgi' | |
# Lock guarding the crawler state that worker threads share
# (the keyword list and the fragment store).
@mutex = Mutex.new
# Runs a "search inside the book" query against amazon.de.
#
# POSTs a form-encoded getSearchResults request (first page, up to 20 hits)
# for the hard-coded product id and parses the response body as JSON.
#
# A failed request or an unparsable body is retried a bounded number of
# times; the previous unbounded recursion overflowed the stack when the
# service stayed unreachable.
#
# @param query [String] the search term
# @param max_retries [Integer] retries after the first failed attempt
# @return [Object] the parsed JSON response, or an empty Hash when every
#   attempt failed
def query(query, max_retries: 3)
  uri = URI.parse("https://www.amazon.de/gp/search-inside/service-data")
  params = {
    :method => "getSearchResults",
    :asin => '0316310352', # product id
    :buyingAsin => "0316310352", # product id
    :query => query,
    :pageSize => 20,
    :pageNumber => 1
  }
  header = { 'Content-Type' => 'application/x-www-form-urlencoded' }

  attempts = 0
  begin
    # A fresh connection object per attempt, as the original retry did.
    http = Net::HTTP.new(uri.host, uri.port)
    http.use_ssl = true
    request = Net::HTTP::Post.new(uri, header)
    request.set_form_data(params)
    JSON.parse(http.request(request).body)
  rescue StandardError
    attempts += 1
    retry if attempts <= max_retries
    # Best effort: callers read response["results"], which is nil here.
    {}
  end
end
# Fragments gathered so far, loaded from the JSON file the crawler writes.
# A missing file (first run) starts the crawl with an empty store instead
# of aborting with Errno::ENOENT.
fragments_path = 'fragments.json'
@fragments = File.exist?(fragments_path) ? JSON.parse(File.read(fragments_path)) : {}
# Top-level crawler threads, joined at the end of the script.
@threads = []
# initial keywords to start the crawl with
@keywords = ["anduin", "sylvanas", "undercity", "stormwind", "azeroth", "azerite", "arathi"]
# Records the fragments of one search response and crawls on from them.
#
# The searched term is appended to @keywords. Each result is then read as
# an array whose element 0 is used as the page id and element 2 as the
# matched text. New text is stored under its page id and persisted to
# fragments.json; every word of it that is not yet a known keyword is
# queried and parsed recursively, one thread per word, and those threads
# are joined before the next result is handled.
#
# @param response [Hash] parsed search response; only "results" is read
# @param query [String] the term that produced +response+
# @return [void]
def parse(response, query)
  @mutex.synchronize { @keywords << query }

  results = response.is_a?(Hash) ? response["results"] : nil
  return if results.nil?

  results.each do |result|
    raw_text = result[2]
    next if raw_text.nil?

    text = clean_fragment_text(raw_text)
    # A fragment that is already stored only skips this result; the
    # remaining results of the response are still processed.
    next unless store_fragment(result[0], text)

    # Kept in a local (not shared state) so concurrent parse calls cannot
    # overwrite each other's word list.
    new_words = @mutex.synchronize do
      text.split(/\W+/).reject(&:empty?).uniq - @keywords
    end

    workers = new_words.map do |word|
      Thread.new { parse(query(word), word) }
    end
    workers.each(&:join)
  end
end

# Strips markup tags, newlines and tabs, then decodes HTML entities.
def clean_fragment_text(raw_text)
  CGI.unescapeHTML(raw_text.gsub(/<\/?[^>]*>/, "").delete("\n\t"))
end

# Adds +text+ to the fragments of +page_id+ and rewrites fragments.json,
# sorted by numeric page id. Returns false when the text was already stored.
def store_fragment(page_id, text)
  @mutex.synchronize do
    page_fragments = (@fragments[page_id] ||= [])
    if page_fragments.include?(text)
      false
    else
      page_fragments.push(text)
      sorted = @fragments.sort_by { |key, _texts| Integer(key) }.to_h
      File.write("fragments.json", JSON.pretty_generate(sorted))
      true
    end
  end
end
# Start one crawler thread per seed keyword, then wait for all of them.
# The loop runs over a snapshot: parse appends to @keywords from the worker
# threads, and Array#each would also visit entries appended while this loop
# is still running, starting extra crawls for them.
seed_keywords = @mutex.synchronize { @keywords.dup }
seed_keywords.each do |keyword|
  @threads << Thread.new { parse(query(keyword), keyword) }
end
@threads.each(&:join)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment