Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
require 'net/http'
require 'uri'
require 'json'
require 'cgi'
# Lock shared by the crawler threads; parse takes it when appending to @keywords.
@mutex = Mutex.new
# Sends one "getSearchResults" POST for a search term to Amazon's
# search-inside service and returns the parsed JSON response.
#
# Any StandardError raised while sending the request or parsing the body
# (network errors, a non-JSON body, ...) triggers another attempt. The number
# of attempts is bounded: the previous implementation retried through
# unbounded recursion, so a permanent failure ended in SystemStackError.
#
# @param query [String] the term to search for inside the book
# @param max_attempts [Integer] how many times the request is tried in total
# @param retry_delay [Numeric] seconds to sleep between attempts (0 disables)
# @return [Object] the parsed JSON body, or an empty Hash when every attempt
#   failed (an empty Hash has no "results" key, i.e. "nothing found")
def query(query, max_attempts: 5, retry_delay: 0.5)
  uri = URI.parse("https://www.amazon.de/gp/search-inside/service-data")
  http = Net::HTTP.new(uri.host, uri.port)
  http.use_ssl = true
  params = {
    :method => "getSearchResults",
    :asin => '0316310352', # product id
    :buyingAsin => "0316310352", # product id
    :query => query,
    :pageSize => 20,
    :pageNumber => 1
  }
  header = { 'Content-Type' => 'application/x-www-form-urlencoded' }

  attempts = 0
  begin
    attempts += 1
    # Build a fresh request object for every attempt.
    request = Net::HTTP::Post.new(uri, header)
    request.set_form_data(params)
    JSON.parse(http.request(request).body)
  rescue StandardError => e
    if attempts < max_attempts
      sleep(retry_delay) if retry_delay > 0
      retry
    end
    # Best effort: report the failure and let the caller carry on.
    warn "query(#{query.inspect}) failed after #{attempts} attempts: #{e.class}: #{e.message}"
    {}
  end
end
# Fragments collected so far, keyed by page id. Loaded from fragments.json
# when that file exists; a first run without the file starts from an empty
# hash instead of crashing with Errno::ENOENT.
@fragments = File.exist?('fragments.json') ? JSON.parse(File.read('fragments.json')) : {}
# Top-level crawler threads, one per seed keyword.
@threads = []
# initial keywords to start the crawl with
@keywords = ["anduin", "sylvanas", "undercity", "stormwind", "azeroth", "azerite", "arathi"]
# Turns a raw result excerpt into plain text: strips HTML tags, removes
# newlines and tabs, then decodes HTML entities.
def clean_fragment(raw)
  CGI.unescapeHTML(raw.gsub(/<\/?[^>]*>/, "").delete("\n\t"))
end

# Stores text under page_id and rewrites fragments.json (sorted by numeric
# page id). Returns false, without writing, when the text is already stored.
# The caller must hold @mutex.
def store_fragment(page_id, text)
  known = (@fragments[page_id] ||= [])
  return false if known.include?(text)

  known.push(text)
  File.open("fragments.json", "w") do |f|
    sorted = @fragments.sort_by { |k, _v| Integer(k) }.to_h
    f.write(JSON.pretty_generate(sorted))
  end
  true
end

# Processes one search response: records the query as a used keyword, stores
# every new text fragment, and recursively crawls each word of a new fragment
# that has not been used as a keyword yet (one thread per word, joined before
# moving on to the next result).
#
# Fixes over the previous version:
# * the undefined @semaphore (nil) is replaced by the existing @mutex;
# * the per-result word list is a local instead of an instance variable
#   shared, and overwritten, by all threads;
# * a duplicate fragment skips only that result instead of returning from the
#   whole method and dropping the remaining results;
# * @keywords is read under the same lock that guards its writes.
#
# @param response [Hash] parsed service response; results are read from its
#   "results" key, each result being indexable with the page id at index 0
#   and the excerpt text at index 2
# @param query [String] the keyword that produced this response
# @return [void]
def parse(response, query)
  results = response["results"]
  @mutex.synchronize { @keywords << query }
  return if results.nil?

  results.each do |result|
    # Keys loaded from fragments.json are always strings; normalizing keeps a
    # page id from the response on the same key. NOTE(review): confirm whether
    # the service sends ids as numbers or strings.
    page_id = result[0].to_s
    text = clean_fragment(result[2])

    # Mutex is not reentrant: everything needing the lock happens in this one
    # section, and the lock is released before any thread is spawned.
    new_words = @mutex.synchronize do
      text.split(/\W+/) - @keywords if store_fragment(page_id, text)
    end
    next unless new_words

    crawlers = new_words.map do |word|
      Thread.new { parse(query(word), word) }
    end
    crawlers.each(&:join)
  end
end
# Start one crawler thread per seed keyword, then wait for all of them.
# Iterate over a snapshot: parse appends to @keywords from the spawned
# threads, and iterating the live array could pick up those appended entries
# (including the re-added seeds) and spawn redundant crawls for them.
@keywords.dup.each do |keyword|
  @threads << Thread.new { parse(query(keyword), keyword) }
end
@threads.each(&:join)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.
You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session.