Skip to content

Instantly share code, notes, and snippets.

@vladfaust
Last active May 8, 2016 17:25
Show Gist options
  • Save vladfaust/a597c99d551413676b95bc53200bb1b4 to your computer and use it in GitHub Desktop.
Save vladfaust/a597c99d551413676b95bc53200bb1b4 to your computer and use it in GitHub Desktop.
Small code snippets to grab from Aliexpress
require 'open-uri'
require 'nokogiri'
require 'active_support'
require 'fileutils'
# Holds the HTTP proxy currently used for scraping and knows how to fetch a
# list of candidate proxies from a public proxy-list page.
module Proxy
  # Page whose `.proxy__t` table is scraped for candidate proxies.
  PROXY_LIST_URL = 'http://hideme.ru/proxy-list/?maxtime=1000&type=h#list'

  # Module-level instance variable (not @@proxy): class variables leak across
  # every class/module that includes this one.
  @proxy = nil

  # @return [Object, nil] whatever was last passed to set_proxy (nil initially)
  def self.get_proxy
    @proxy
  end

  # Stores the proxy that subsequent requests should go through.
  #
  # @param proxy [Object, nil] value later handed to open-uri's :proxy option
  # @return [Object, nil] the stored value
  def self.set_proxy(proxy)
    @proxy = proxy
  end

  # Downloads the proxy-list page and extracts one entry per table row.
  # Rows that do not have both an address cell and a port cell are skipped.
  #
  # @return [Array<Hash>] entries shaped like { ip: String, port: Integer }
  def self.get_proxy_list
    # URI#open (mixed in by open-uri) instead of Kernel#open: the latter no
    # longer opens URLs since Ruby 3.0.
    doc = Nokogiri::HTML(URI.parse(PROXY_LIST_URL).open)

    doc.css('.proxy__t tbody tr').each_with_object([]) do |row, list|
      cells = row.css('td')
      ip_cell = cells[0]
      port_cell = cells[1]
      next unless ip_cell && port_cell

      list << { ip: ip_cell.text, port: port_cell.text.to_i }
    end
  end
end
# Scraping helpers for AliExpress product pages: URL normalisation, item
# metadata extraction and image download. All page requests go through the
# proxy currently stored in Proxy.
module AliParse
  # Accepted product URLs: optional scheme, optional single subdomain label,
  # then an /item/ or /store/product/ path. Dots are escaped and the match is
  # anchored with \A so look-alike hosts (e.g. "evilaliexpress.com") fail.
  PRODUCT_URL_PATTERN = %r{\A(https?://)?(\w+\.)?aliexpress\.com/(item/|store/product/)}

  # Captures the file name (without ".html") of a breadcrumb link.
  CATEGORY_SLUG_PATTERN = %r{/([\w\-]*)\.html}

  # Recognises an address that already carries a scheme.
  SCHEME_PATTERN = %r{\A[A-Za-z][A-Za-z0-9+\-.]*://}

  # Percent-decodes the URL, drops everything from the first "?", "#", "$" or
  # whitespace on, and checks that the rest is an AliExpress product URL.
  #
  # @param url [String] raw (possibly percent-encoded) product URL
  # @return [String] the cleaned URL
  # @raise [RuntimeError] 'Invalid URL' when the input is empty or not a
  #   product URL
  def self.clear_url(url)
    # URI.unescape was removed in Ruby 3.0; the parser method is its successor.
    new_url = URI::DEFAULT_PARSER.unescape(url.to_s)[/[^?#\s$]+/]
    raise('Invalid URL') unless new_url && new_url =~ PRODUCT_URL_PATTERN

    new_url
  end

  # Strips the subdomain label directly in front of "aliexpress.com"
  # (e.g. "ru."). URLs without a subdomain are returned unchanged, and only
  # the host is touched, never identical text elsewhere in the path.
  #
  # @param url [String]
  # @return [String]
  def self.remove_int(url)
    url.sub(/\w+\.(?=aliexpress\.com)/, '')
  end

  # Fetches a page through the current proxy and parses it.
  #
  # @param url [String] page address; a missing scheme defaults to http
  # @return [Nokogiri::HTML::Document]
  def self.open_url(url)
    # URI#open rather than Kernel#open: the latter stopped opening URLs in
    # Ruby 3.0 and would treat a leading "|" as a shell command.
    Nokogiri::HTML(URI.parse(with_scheme(url)).open(proxy: Proxy.get_proxy))
  end

  # Scrapes the product page and returns its metadata.
  #
  # The currency requested via a `currencyType=` query parameter of
  # input_url (default 'USD') is forwarded to the page request; the returned
  # :currency is whatever the page itself declares.
  #
  # @param input_url [String] product URL, optionally with currencyType=XXX
  # @return [Hash] keys :id, :category, :subcategory, :subsubcategory, :name,
  #   :image_src, :price, :high_price, :low_price, :currency, :url, :reviews,
  #   :rating, :store_number, :store_name, :store_rank; values are nil when
  #   the corresponding element is absent (:store_rank is a node set)
  # @raise [RuntimeError] 'Invalid URL' (see clear_url)
  def self.get_aliexpress_item_info(input_url)
    url = clear_url(input_url)
    # No trailing delimiter is required, so the parameter is also found when
    # it is the last thing in the URL.
    requested_currency = input_url[/currencyType=(\w+)/, 1] || 'USD'
    url = "#{ url }?expanded=true&currencyType=#{ requested_currency }"
    doc = open_url(url)

    category, subcategory, subsubcategory = category_slugs(doc)
    thumb = doc.css('img[data-role="thumb"]')[0]

    high_price = text_of(doc, '[itemprop="highPrice"]')&.to_f
    low_price = text_of(doc, '[itemprop="lowPrice"]')&.to_f
    price = text_of(doc, '[itemprop="price"]')&.to_f
    # Fall back to the middle of the price range, but only if the page
    # actually provides both ends of it.
    price ||= (high_price + low_price) / 2.0 if high_price && low_price

    {
      id: attribute_value(doc, '#hid-product-id', 'value'),
      category: category,
      subcategory: subcategory,
      subsubcategory: subsubcategory,
      name: thumb&.attribute('title')&.value,
      image_src: thumb&.attribute('src')&.value,
      price: price,
      high_price: high_price,
      low_price: low_price,
      currency: attribute_value(doc, '[itemprop="priceCurrency"]', 'content'),
      url: url,
      reviews: text_of(doc, '[itemprop="reviewCount"]')&.to_i,
      rating: text_of(doc, '[itemprop="ratingValue"]')&.to_f,
      store_number: attribute_value(doc, '#j-store-header .s-alitalk a', 'data-id2'),
      store_name: text_of(doc, '#j-store-header .shop-name a'),
      store_rank: doc.css('#j-store-header .store-rank span') # TODO: Requires JS
    }
  end

  # Downloads the product's images into a directory named after the item id
  # (created relative to the current working directory).
  #
  # @param input_url [String] product URL
  # @param main [Boolean] download the gallery images as main_<n>.<ext>
  # @param description [Boolean] download the images embedded in the
  #   description as description_<n>.<ext>
  # @return [true]
  # @raise [RuntimeError] on an invalid URL or when the page exposes no
  #   product id
  def self.download_item_images(input_url, main = true, description = false)
    url = remove_int(clear_url(input_url))
    download_main_images(url) if main
    download_description_images(url) if description
    true
  end

  # Gallery images: fetched from the "item-img/" variant of the product URL.
  def self.download_main_images(url)
    gallery_url = url
                  .sub(%r{/\d+_(\d+\.html)\z}, '/\1') # drop the "<digits>_" prefix of the file name
                  .sub(%r{item/|store/product/}, 'item-img/') # first path segment only, not the slug
    doc = open_url(gallery_url)
    item_id = product_id(doc, '#atc-product-id')
    FileUtils.mkdir_p(item_id)

    saved = save_images(doc.css('.image ul li a img'), item_id, 'main', /\.(\w+)$/)
    puts "Downloaded #{ saved } main images to /#{ item_id }"
  end

  # Description images: listed by a separate description endpoint that is
  # addressed by product id.
  def self.download_description_images(url)
    item_id = product_id(open_url(url), '#hid-product-id')
    FileUtils.mkdir_p(item_id)

    description_doc = open_url("http://desc.aliexpress.com/getDescModuleAjax.htm?productId=#{ item_id }")
    saved = save_images(description_doc.css('img'), item_id, 'description', /\.(jpg|png|gif|jpeg)/)
    puts "Downloaded #{ saved } description images to /#{ item_id }"
  end

  # Writes every image to "<dir>/<prefix>_<index>.<ext>" and returns how many
  # were saved. Images without a src, or whose src does not match
  # extension_pattern, are skipped (their index is not reused).
  def self.save_images(images, dir, prefix, extension_pattern)
    saved = 0
    images.each_with_index do |image, index|
      src = image['src']
      extension = src && src[extension_pattern, 1]
      next unless extension

      # Download first so a failed request does not leave an empty file; the
      # block form closes the remote handle.
      data = URI.parse(with_scheme(src)).open(&:read)
      File.binwrite(File.join(dir, "#{ prefix }_#{ index }.#{ extension }"), data)
      saved += 1
    end
    saved
  end

  # Reads the product id from the `value` attribute of the given element.
  def self.product_id(doc, selector)
    id = attribute_value(doc, selector, 'value')
    raise('Product id not found') if id.nil? || id.empty?

    id
  end

  # Slugs of up to three breadcrumb levels after the first two links, padded
  # with nil. Levels after the first unparsable link stay nil.
  # TODO: parse store breadcrumbs as well.
  def self.category_slugs(doc)
    links = doc.css('.ui-breadcrumb a').to_a.drop(2).first(3)
    slugs = links.map { |link| link['href'].to_s[CATEGORY_SLUG_PATTERN, 1] }
    known = slugs.take_while { |slug| slug }
    Array.new(3) { |level| known[level] }
  end

  # Text of the first element matching selector, or nil.
  def self.text_of(doc, selector)
    doc.css(selector)[0]&.text
  end

  # Attribute of the first element matching selector, or nil.
  def self.attribute_value(doc, selector, attribute)
    doc.css(selector)[0]&.attribute(attribute)&.value
  end

  # Gives scheme-less ("host/path") and protocol-relative ("//host/path")
  # addresses an http scheme so URI#open can fetch them.
  def self.with_scheme(url)
    return url if url =~ SCHEME_PATTERN

    url.start_with?('//') ? "http:#{ url }" : "http://#{ url }"
  end

  private_class_method :download_main_images, :download_description_images,
                       :save_images, :product_id, :category_slugs,
                       :text_of, :attribute_value, :with_scheme
end
# Picks a random entry from a freshly fetched proxy list and makes it the
# proxy used for subsequent requests.
#
# @return [Object] whatever Proxy.set_proxy returns
# @raise [RuntimeError] when the proxy list is empty
def get_random_proxy
  # Short pause before each list fetch -- presumably to avoid hammering the
  # proxy-list site when proxies are rotated in a tight retry loop.
  sleep 0.5
  proxy = Proxy.get_proxy_list.sample
  raise 'No proxies available' unless proxy

  # List entries are { ip:, port: }; there is no :host key.
  Proxy.set_proxy(URI.parse("http://#{ proxy[:ip] }:#{ proxy[:port] }"))
end
###
# Command-line driver: every argument is an AliExpress product URL. For each
# one, a line of item data is appended to parsed_<timestamp>.txt and the
# item's main and description images are downloaded. Failed requests are
# retried through a different random proxy.

# Upper bound on tries per step, so a failure that no proxy can cure (for
# example an invalid URL) cannot spin forever.
MAX_PROXY_ATTEMPTS = 20

# Runs the block, switching to another random proxy and retrying whenever it
# raises.
#
# Only StandardError is rescued, so Ctrl-C (Interrupt) and exit still work.
#
# @param max_attempts [Integer] total number of tries before giving up
# @return [Object, nil] the block's result, or nil once all attempts failed
def with_proxy_retries(max_attempts = MAX_PROXY_ATTEMPTS)
  attempts = 0
  begin
    attempts += 1
    yield
  rescue StandardError => e
    if attempts >= max_attempts
      warn "Giving up after #{ attempts } attempts: #{ e.message }"
      return nil
    end
    get_random_proxy
    retry
  end
end

# Formats a number with two decimals and a comma as decimal separator.
def format_decimal(number)
  ActiveSupport::NumberHelper.number_to_rounded(number, precision: 2, significant: false, separator: ',')
end

# Writes one output line for an item: id, image formula, price,
# rating/reviews link, store link and the three category levels.
def write_item_row(output, item)
  # ID
  output.print "#{ item[:id] } "
  # Image
  output.print "=IMAGE(\"#{ item[:image_src] }\") "
  # Price
  output.print "#{ format_decimal(item[:price]) } "
  # Rating / Reviews
  output.print "=HYPERLINK(\"#{ item[:url] }\"; \"#{ format_decimal(item[:rating]) } / #{ item[:reviews] }\") "
  # Store
  output.print "=HYPERLINK(\"aliexpress.com/store/#{ item[:store_number] }\"; \"#{ item[:store_name] }\") "
  # Category
  output.print "#{ item[:category] } "
  # Subcategory
  output.print "#{ item[:subcategory] } "
  # Subsubcategory
  output.print "#{ item[:subsubcategory] } "
  output.print "\n"
end

puts "Hello!"
filename = "parsed_#{ Time.now.to_i }.txt"

# Block form closes the file even when an exception escapes.
File.open(filename, 'w') do |output|
  get_random_proxy

  ARGV.each do |url|
    puts "Processing #{ url }..."

    item = with_proxy_retries { AliParse.get_aliexpress_item_info(url) }
    if item.nil?
      warn "Skipping #{ url }"
      next
    end
    puts "Parsed item ##{ item[:id] }!"
    write_item_row(output, item)

    # Main gallery images first, then the images embedded in the description.
    with_proxy_retries { AliParse.download_item_images(url, true, false) }
    with_proxy_retries { AliParse.download_item_images(url, false, true) }

    print "\n"
  end
end

puts "\nJob is done, written to #{ filename }"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment