Last active
May 8, 2016 17:25
-
-
Save vladfaust/a597c99d551413676b95bc53200bb1b4 to your computer and use it in GitHub Desktop.
Small code snippets for scraping item data and images from AliExpress.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# External dependencies: open-uri + nokogiri for fetching/parsing pages,
# active_support for number formatting, fileutils for image directories.
%w[open-uri nokogiri active_support fileutils].each { |lib| require lib }
# Holds the HTTP proxy currently in use and knows how to fetch fresh
# candidate proxies from hideme.ru's free proxy list.
module Proxy
  # Class-instance variable instead of a @@class variable: it is not
  # shared across an inheritance tree and is the idiomatic choice.
  @proxy = nil

  # @return [URI, String, nil] the proxy currently in use (nil = direct).
  def self.get_proxy
    @proxy
  end

  # @param proxy [URI, String, nil] proxy to use for subsequent requests.
  def self.set_proxy(proxy)
    @proxy = proxy
  end

  # Scrapes hideme.ru for HTTP proxies with a max response time of 1000 ms.
  # Performs a network request; raises on connection failure.
  # @return [Array<Hash>] e.g. [{ ip: '1.2.3.4', port: 8080 }, ...]
  def self.get_proxy_list
    uri = 'http://hideme.ru/proxy-list/?maxtime=1000&type=h#list'
    # URI.open: Kernel#open on a URL string was removed in Ruby 3.
    doc = Nokogiri::HTML(URI.open(uri))
    doc.css('.proxy__t tbody tr').map do |row|
      cells = row.css('td')
      { ip: cells[0].text, port: cells[1].text.to_i }
    end
  end
end
# Scraping helpers for AliExpress product pages: URL normalisation,
# item-info extraction and image downloading.
module AliParse
  # Normalises a product URL: percent-decodes it, strips the query string
  # and fragment, and validates that it points at an AliExpress item page.
  # @param url [String] raw URL as given on the command line
  # @return [String] the cleaned URL
  # @raise [RuntimeError] 'Invalid URL' for non-item / non-store-product URLs
  def self.clear_url(url)
    # URI.unescape was removed in Ruby 3; the parser method is the
    # drop-in equivalent (no '+'-to-space conversion).
    new_url = URI::DEFAULT_PARSER.unescape(url).match(/([^?#\s$]+)/)[0]
    raise('Invalid URL') unless new_url[/^(https?:\/\/)?(\w*.)?aliexpress.com\/(item\/|store\/product\/)/]
    new_url
  end

  # Strips the international subdomain ('ru.', 'www.', ...) so the
  # canonical aliexpress.com host is used.
  def self.remove_int(url)
    # sub, not gsub: only the subdomain occurrence must be removed, not
    # every occurrence of the same string elsewhere in the URL.
    url.sub(url.match(/(\w+\.)aliexpress\.com/)[1], '')
  end

  # Fetches +url+ through the currently configured proxy and parses it.
  # @return [Nokogiri::HTML::Document]
  def self.open_url(url)
    # URI.open: Kernel#open on a URL string was removed in Ruby 3.
    Nokogiri::HTML(URI.open(url, proxy: Proxy.get_proxy))
  end

  # Scrapes an item page and returns a Hash of its attributes.
  # Missing fields come back as nil. Performs network requests.
  # @param input_url [String] raw item URL (may carry a currencyType param)
  # @return [Hash] id, category tree, name, image, prices, rating, store info
  def self.get_aliexpress_item_info(input_url)
    url = clear_url(input_url)
    # (?:&|$): the original character class [&|$] matched literal chars
    # and missed a currencyType parameter at the end of the query string.
    currency = input_url.match(/currencyType=(\w+)(?:&|$)/)&.captures&.first || 'USD'
    # '&currencyType=' was mojibake ('¤cyType') in the original source.
    url = "#{ url }?expanded=true#{ ('&currencyType=' + currency) if currency }"
    doc = open_url(url)
    begin
      breadcrumbs = doc.css('.ui-breadcrumb a')
      # Drop the first two crumbs (home / root category links).
      breadcrumbs.shift
      breadcrumbs.shift
      category = breadcrumbs[0].attribute('href').value.match(/\/([\w\-]*)\.html/).captures[0]
      subcategory = (breadcrumbs[1].attribute('href').value.match(/\/([\w\-]*)\.html/).captures[0] if breadcrumbs[1])
      subsubcategory = (breadcrumbs[2].attribute('href').value.match(/\/([\w\-]*)\.html/).captures[0] if breadcrumbs[2])
    rescue StandardError
      # Store pages use a different breadcrumb layout; leave categories nil.
      # TODO make stores breadcumbs parse
    end
    id = doc.css('#hid-product-id')[0]&.attribute('value')&.value
    name = doc.css('img[data-role="thumb"]')[0]&.attribute('title')&.value
    image_src = doc.css('img[data-role="thumb"]')[0]&.attribute('src')&.value
    reviews = doc.css('[itemprop="reviewCount"]')[0]&.text&.to_i
    rating = doc.css('[itemprop="ratingValue"]')[0]&.text&.to_f
    high_price = doc.css('[itemprop="highPrice"]')[0]&.text&.to_f
    low_price = doc.css('[itemprop="lowPrice"]')[0]&.text&.to_f
    currency = doc.css('[itemprop="priceCurrency"]')[0]&.attribute('content')&.value
    price = doc.css('[itemprop="price"]')[0]&.text&.to_f
    # Average of the range when no single price is shown — guarded so a
    # page without both bounds cannot raise NoMethodError on nil.
    price ||= (high_price + low_price) / 2.0 if high_price && low_price
    store_number = doc.css('#j-store-header .s-alitalk a')[0]&.attribute('data-id2')&.value
    store_name = doc.css('#j-store-header .shop-name a')[0]&.text
    store_rank = doc.css('#j-store-header .store-rank span') # TODO: Requires JS
    {
      id: id,
      category: category,
      subcategory: subcategory,
      subsubcategory: subsubcategory,
      name: name,
      image_src: image_src,
      price: price,
      high_price: high_price,
      low_price: low_price,
      currency: currency,
      url: url,
      reviews: reviews,
      rating: rating,
      store_number: store_number,
      store_name: store_name,
      store_rank: store_rank
    }
  end

  # Downloads item images into a ./<item_id>/ directory.
  # @param input_url [String] raw item URL
  # @param main [Boolean] download the main gallery images
  # @param description [Boolean] download the description-block images
  # @return [true]
  def self.download_item_images(input_url, main = true, description = false)
    url = remove_int(clear_url(input_url))
    if main
      _url = url
      # Some URLs embed "<digits>_<id>.html"; strip the leading digit run.
      some_trash = _url.match(/\/(\d+_)\d+.html$/)&.[](1)
      _url = _url.gsub(some_trash, '') if some_trash
      _url = _url.gsub(url.match(/(item\/|store\/product\/)/)[1], 'item-img/')
      doc = open_url(_url)
      # &.value as well: a missing attribute must not raise NoMethodError.
      item_id = doc.css('#atc-product-id')[0]&.attribute('value')&.value
      FileUtils.mkdir_p(item_id) unless File.directory?(item_id)
      images = doc.css('.image ul li a img')
      images.each_with_index do |image, index|
        extension = image['src'].match(/\.(\w+)$/)[1]
        File.open(File.join(item_id.to_s, "main_#{ index }.#{ extension }"), 'wb') do |file|
          # puts "Writing main_#{ index }.#{ extension }..."
          file.write(URI.open(image['src']).read)
        end
      end
      puts "Downloaded #{ images.length } main images to /#{ item_id }"
    end
    if description
      doc = open_url(url)
      item_id = doc.css('#hid-product-id')[0]&.attribute('value')&.value
      FileUtils.mkdir_p(item_id) unless File.directory?(item_id)
      description_url = "http://desc.aliexpress.com/getDescModuleAjax.htm?productId=#{ item_id }"
      # Local name desc_doc: do not shadow the +description+ flag parameter.
      desc_doc = open_url(description_url)
      images = desc_doc.css('img')
      images.each_with_index do |image, index|
        extension = image['src'].match(/\.(jpg|png|gif|jpeg)/)[1]
        filename = "description_#{ index }.#{ extension }"
        File.open(File.join(item_id.to_s, filename), 'wb') do |file|
          # puts "Writing #{ filename }"
          file.write(URI.open(image['src']).read)
        end
      end
      puts "Downloaded #{ images.length } description images to /#{ item_id }"
    end
    true
  end
end
# Picks a random proxy from the freshly fetched list and installs it as
# the current proxy. Sleeps briefly to avoid hammering the list site.
def get_random_proxy
  sleep 0.5
  proxy = Proxy.get_proxy_list.sample
  # Bug fix: list entries are { ip:, port: } — the original read
  # proxy[:host], which is always nil and produced "http://<ip>:".
  Proxy.set_proxy(URI.parse("http://#{ proxy[:ip] }:#{ proxy[:port] }"))
  # puts "Trying proxy #{ Proxy.get_proxy }..."
end
###
# Entry point: for every URL on the command line, parse the item, write a
# spreadsheet-style row to the output file, then download its images,
# rotating to a new random proxy after every failure.
puts "Hello!"
filename = "parsed_#{ Time.now.to_i }.txt"
output = File.open(filename, 'w')
get_random_proxy
ARGV.each do |url|
  puts "Processing #{ url }..."
  loop do
    begin
      item = AliParse.get_aliexpress_item_info(url)
      puts "Parsed item ##{ item[:id] }!"
    rescue StandardError => e
      # rescue StandardError, never Exception: Exception also swallows
      # Interrupt/SystemExit, making this retry loop impossible to Ctrl-C.
      # puts e.message
      get_random_proxy
      next
    end
    # ID
    output.print "#{ item[:id] } "
    # Image
    output.print "=IMAGE(\"#{ item[:image_src] }\") "
    # Price
    output.print "#{ ActiveSupport::NumberHelper.number_to_rounded(item[:price], precision: 2, significant: false, separator: ',') } "
    # Rating / Reviews
    output.print "=HYPERLINK(\"#{ item[:url] }\"; \"#{ ActiveSupport::NumberHelper.number_to_rounded(item[:rating], precision: 2, significant: false, separator: ',') } / #{ item[:reviews] }\") "
    # Store
    output.print "=HYPERLINK(\"aliexpress.com/store/#{ item[:store_number] }\"; \"#{ item[:store_name] }\") "
    # Category
    output.print "#{ item[:category] } "
    # Subcategory
    output.print "#{ item[:subcategory] } "
    # Subsubcategory
    output.print "#{ item[:subsubcategory] } "
    output.print "\n"
    break
  end
  loop do
    begin
      AliParse.download_item_images(url, true, false)
      break
    rescue StandardError => e
      # p e.message
      get_random_proxy
    end
  end
  loop do
    begin
      AliParse.download_item_images(url, false, true)
      break
    rescue StandardError => e
      # p e.message
      get_random_proxy
    end
  end
  print "\n"
end
output.close
puts "\nJob is done, written to #{ filename }"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment.