-
-
Save Thermatix/0494384fd82cc90d061b140d8d6e7784 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# A node in a tree of configuration data.
#
# Each node has an identifier (+ident+), a data hash seeded from a template
# (+init+) and a hash of child nodes keyed by their identifier. Enumerable is
# mixed in; #each visits the node itself first and then every descendant,
# depth-first, in child insertion order.
class Node
  include Enumerable

  attr_accessor :childs, :init, :data, :ident

  # init  - Hash used as the template for this node's data and for the data of
  #         children created through #new_child.
  # ident - identifier of this node (nil when omitted).
  #
  # The template is deep-copied for both @init and @data. A shallow +dup+
  # would leave nested containers (for example a +fields: {}+ entry) shared
  # between the caller's hash, this node and every child, so a write on one
  # node would appear on all of them.
  def initialize(init = {}, ident = nil)
    @ident  = ident
    @childs = {}
    @init   = deep_copy(init)
    @data   = deep_copy(init)
  end

  # Yields this node, then recurses into every child.
  # Returns an Enumerator when called without a block, otherwise self.
  def each(&block)
    return enum_for(:each) unless block_given?

    block.call(self)
    @childs.each_value { |child| child.each(&block) }
    self
  end

  # Nodes are ordered by comparing their data hashes.
  def <=>(other_node)
    @data <=> other_node.data
  end

  # Resolves a path of identifiers, given either as separate arguments or as
  # (nested) arrays. Each step searches the whole subtree of the node found so
  # far (via Enumerable#find) for the next identifier.
  #
  # Returns self when this node has no children or when the path is empty,
  # and nil when any identifier on the path cannot be found.
  def [](*keys)
    return self if @childs.empty?

    keys.flatten.inject(self) do |node, ident|
      # A missing intermediate node ends the lookup with nil instead of
      # raising NoMethodError on nil.
      node && node.find { |candidate| candidate.ident == ident }
    end
  end

  # Creates a child seeded from this node's template, yields it to the
  # optional block, registers it under +ident+ and returns it.
  def new_child(ident)
    child = self.class.new(@init, ident)
    yield child if block_given?
    @childs[ident] = child
  end

  # Hook used by the pp library: prints the identifier and data hash of this
  # node and of every descendant through the given printer.
  def pretty_print(pp)
    each do |node|
      # ident may be nil or a Symbol; the printer expects a String.
      pp.text(node.ident.to_s)
      # Line break goes through the printer rather than straight to stdout.
      pp.breakable
      pp.pp_hash(node.data)
    end
  end

  private

  # Recursively copies Hashes, Arrays and Strings; any other value is returned
  # as-is.
  def deep_copy(value)
    case value
    when Hash
      value.each_with_object({}) { |(key, item), copy| copy[key] = deep_copy(item) }
    when Array
      value.map { |item| deep_copy(item) }
    when String
      value.dup
    else
      value
    end
  end
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Scraper definition for https://www.servocity.com, written with the Scraper
# DSL: it declares the product fields to extract and the menu/category/product
# links to follow.
class Servo_City < Scraper
  set_up
  set_url "https://www.servocity.com"

  # Multipliers applied to the scraped price, keyed by currency code.
  X_Rate = {
    aud: 1.3
  }

  # Field definitions shared by every page level declared below.
  re_useable :define_fields do
    xpath_prefix "//div[@class = 'product-view']" do
      set_field :title, "//div[@class = 'product-name']//h1"
      set_field :description, "//div[@class = 'std']", scrub_tags: true

      xpath_prefix "//div[@class = 'product-shop']" do
        set_field :supplier_rrp, "//span[@class = 'price']", after: :transform_rrp
        set_field :mpn, "//div[@id ='skuInfo']"
      end

      set_field :supplier_product_url, "", page_url: true

      xpath_prefix "//table[@class = 'data-table']" do
        set_field :weight, "//td[@class = 'data last' and contains(text(), 'lbs')]", default: 100
      end

      set_field :price, "", after: :transform_rrp_to_usd
      # set_field :retired, ""
      # set_field :moq
    end
  end

  sub_page(:main_menu, "//div[contains(@class,'main-container')]//ul[contains(@class,'itemparent')]//li//a") do
    # select_sub_page
    recursive_select(
      category: "//ul[contains(@class,'apptrian-subcategories')]//li//a",
      product: "//div[contains(@class,'category-products')]//li//a",
      product_section: "//table[contains(@class,'zebra')]//td[1]//a"
    ) do
      scrape_here_if(fields: [:supplier_rrp])
      use :define_fields
    end
  end

  # Removes every dollar sign from the scraped price text.
  def transform_rrp(string, field_data)
    string.delete("$")
  end

  # Multiplies the already-scraped :supplier_rrp by the :aud rate.
  # NOTE(review): the method name says "usd" but the rate used is X_Rate[:aud];
  # confirm which direction of conversion is intended.
  def transform_rrp_to_usd(string, field_data)
    rrp = field_data[:supplier_rrp].to_f
    rrp * X_Rate[:aud]
  end

  # Debug dump of the configuration tree, run while the class body is loaded.
  @tree.each do |node|
    ap format("----------{%s}----------", node.ident)
    node.data.each do |key, value|
      ap format("======={%s:%s}======", node.ident, key)
      ap value
    end
  end
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require_relative "node" | |
# Namespace for the scraper and its configuration DSL.
class Scraper
  # Declarative methods for describing a scrape: which sub pages to follow and
  # which fields / meta tags to extract. Configuration is stored in a Node
  # tree (@tree); @tree_pointer is the path of identifiers to the node that is
  # currently being configured.
  module DSL
    # Initialises the DSL state. Only the first call has any effect (guarded
    # by @setup).
    def set_up
      return if @setup

      @tree = Node.new({
        fields: {},
        meta: {},
        recursive_select: false,
        sub_page_tag: {},
        select: false,
        scrape_if: nil
      })
      @tree_pointer = []
      @xpath_prefix = []
      @useables     = {}
      @writer       = nil
      @base_url     = ""
      @inc          = 1
      @setup        = true
    end

    # Runs the block with +key+ appended to the tree pointer and removes it
    # again afterwards. +childs+ is accepted but currently unused.
    def tree_down(key, childs = false)
      @tree_pointer << key
      yield
      @tree_pointer.pop
    end

    # Stores a block under +name+ so it can be replayed later with #use.
    # Raises RuntimeError when no block is given.
    def re_useable(name, &block)
      check_for_block(&block)
      @useables[name] = block
    end

    # Replays the block stored under +name+ in the context of the receiver.
    def use(name)
      instance_exec(&@useables[name])
    end

    # Stores the base URL of the site to scrape.
    # (Previously the argument was discarded and "" was stored instead.)
    def set_url(url)
      @base_url = url
    end

    # Stores the pagination increment.
    def set_pag_increment(value)
      @inc = value
    end

    # Runs the block with +prefix+ pushed onto the xpath prefix stack; every
    # non-empty xpath registered inside the block is prepended with all active
    # prefixes. Raises RuntimeError when no block is given.
    def xpath_prefix(prefix, &block)
      check_for_block(&block)
      @xpath_prefix << prefix
      yield
      @xpath_prefix.pop
    end

    # Records the scrape condition for the current node: either the given
    # +args+ or, when +args+ is nil, a hash wrapping the block.
    def scrape_here_if(args = nil, &block)
      @tree[@tree_pointer].data[:scrape_if] = args || { block: block }
    end

    # Flags the current node as selected.
    def select_sub_page
      @tree[@tree_pointer].data[:select] = true
    end

    # Recursively drill into a page: declares one sub page per
    # identifier => xpath pair in +page_tags+, flags each as recursive and
    # runs the same block for every one of them.
    # Raises RuntimeError when no block is given.
    def recursive_select(page_tags, &block)
      check_for_block(&block)
      page_tags.each do |item, path|
        sub_page(item, path) do
          @tree[@tree_pointer].data[:recursive_select] = true
          block.call
        end
      end
    end

    # Drill down into a page: adds a child node +item+ under the current node,
    # stores the (prefixed) link xpath on it and runs the block with the tree
    # pointer moved onto that child.
    def sub_page(item, path)
      @tree[@tree_pointer].new_child(item).data[:sub_page_tag] = join_xpath(path)
      tree_down(item) do
        yield
      end
    end

    # Registers a field to extract on the current node.
    def set_field(name, xpath, meta = {})
      set_info(:fields, name, xpath, meta)
    end

    # Registers a meta tag to extract on the current node.
    def set_meta_tag(name, xpath, meta = {})
      set_info(:meta, name, xpath, meta)
    end

    # Stores the block used as writer.
    def writer(&block)
      @writer = block
    end

    private

    # Raises RuntimeError naming the calling DSL method when it was invoked
    # without a block. The name comes from the caller's backtrace location
    # rather than from parsing the backtrace string, whose quoting differs
    # between Ruby versions.
    def check_for_block(&block)
      return if block_given?

      raise "No block given for #%s" % caller_locations(1, 1).first.base_label
    end

    # Stores an xpath/meta entry of the given +type+ (:fields or :meta) under
    # +name+ on the current node.
    def set_info(type, name, xpath, meta)
      @tree[@tree_pointer].data[type][name] = {
        xpath: join_xpath(xpath),
        meta: meta
      }
    end

    # Prepends all active xpath prefixes to +tag+; an empty tag is returned
    # unchanged.
    def join_xpath(tag)
      tag.empty? ? tag : @xpath_prefix.join + tag
    end
  end
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
V this is the pointer before the code block | |
[ | |
[0] :main_menu, | |
[1] :category | |
] | |
:defining_fields | |
V this is the pointer inside the code block | |
[ | |
[0] :main_menu, | |
[1] :category | |
] | |
V this is the pointer before the code block | |
[ | |
[0] :main_menu, | |
[1] :product | |
] | |
:defining_fields | |
V this is the pointer inside the code block | |
[ | |
[0] :main_menu, | |
[1] :product | |
] | |
V this is the pointer before the code block | |
[ | |
[0] :main_menu, | |
[1] :product_section | |
] | |
:defining_fields | |
V this is the pointer inside the code block | |
[ | |
[0] :main_menu, | |
[1] :product_section | |
] | |
"----------{}----------" | |
"======={:fields}======" | |
{ | |
:title => { | |
:xpath => "//div[@class = 'product-view']//div[@class = 'product-name']//h1", | |
:meta => {} | |
}, | |
:description => { | |
:xpath => "//div[@class = 'product-view']//div[@class = 'std']", | |
:meta => { | |
:scrub_tags => true | |
} | |
}, | |
:supplier_rrp => { | |
:xpath => "//div[@class = 'product-view']//div[@class = 'product-shop']//span[@class = 'price']", | |
:meta => { | |
:after => :transform_rrp | |
} | |
}, | |
:mpn => { | |
:xpath => "//div[@class = 'product-view']//div[@class = 'product-shop']//div[@id ='skuInfo']", | |
:meta => {} | |
}, | |
:supplier_product_url => { | |
:xpath => "", | |
:meta => { | |
:page_url => true | |
} | |
}, | |
:weight => { | |
:xpath => "//div[@class = 'product-view']//table[@class = 'data-table']//td[@class = 'data last' and contains(text(), 'lbs')]", | |
:meta => { | |
:default => 100 | |
} | |
}, | |
:price => { | |
:xpath => "", | |
:meta => { | |
:after => :transform_rrp_to_usd | |
} | |
} | |
} | |
"======={:meta}======" | |
{} | |
"======={:recursive_select}======" | |
false | |
"======={:sub_page_tag}======" | |
{} | |
"======={:select}======" | |
false | |
"======={:scrape_if}======" | |
nil | |
"----------{main_menu}----------" | |
"======={main_menu:fields}======" | |
{ | |
:title => { | |
:xpath => "//div[@class = 'product-view']//div[@class = 'product-name']//h1", | |
:meta => {} | |
}, | |
:description => { | |
:xpath => "//div[@class = 'product-view']//div[@class = 'std']", | |
:meta => { | |
:scrub_tags => true | |
} | |
}, | |
:supplier_rrp => { | |
:xpath => "//div[@class = 'product-view']//div[@class = 'product-shop']//span[@class = 'price']", | |
:meta => { | |
:after => :transform_rrp | |
} | |
}, | |
:mpn => { | |
:xpath => "//div[@class = 'product-view']//div[@class = 'product-shop']//div[@id ='skuInfo']", | |
:meta => {} | |
}, | |
:supplier_product_url => { | |
:xpath => "", | |
:meta => { | |
:page_url => true | |
} | |
}, | |
:weight => { | |
:xpath => "//div[@class = 'product-view']//table[@class = 'data-table']//td[@class = 'data last' and contains(text(), 'lbs')]", | |
:meta => { | |
:default => 100 | |
} | |
}, | |
:price => { | |
:xpath => "", | |
:meta => { | |
:after => :transform_rrp_to_usd | |
} | |
} | |
} | |
"======={main_menu:meta}======" | |
{} | |
"======={main_menu:recursive_select}======" | |
false | |
"======={main_menu:sub_page_tag}======" | |
"//div[contains(@class,'main-container')]//ul[contains(@class,'itemparent')]//li//a" | |
"======={main_menu:select}======" | |
false | |
"======={main_menu:scrape_if}======" | |
nil | |
"----------{category}----------" | |
"======={category:fields}======" | |
{ | |
:title => { | |
:xpath => "//div[@class = 'product-view']//div[@class = 'product-name']//h1", | |
:meta => {} | |
}, | |
:description => { | |
:xpath => "//div[@class = 'product-view']//div[@class = 'std']", | |
:meta => { | |
:scrub_tags => true | |
} | |
}, | |
:supplier_rrp => { | |
:xpath => "//div[@class = 'product-view']//div[@class = 'product-shop']//span[@class = 'price']", | |
:meta => { | |
:after => :transform_rrp | |
} | |
}, | |
:mpn => { | |
:xpath => "//div[@class = 'product-view']//div[@class = 'product-shop']//div[@id ='skuInfo']", | |
:meta => {} | |
}, | |
:supplier_product_url => { | |
:xpath => "", | |
:meta => { | |
:page_url => true | |
} | |
}, | |
:weight => { | |
:xpath => "//div[@class = 'product-view']//table[@class = 'data-table']//td[@class = 'data last' and contains(text(), 'lbs')]", | |
:meta => { | |
:default => 100 | |
} | |
}, | |
:price => { | |
:xpath => "", | |
:meta => { | |
:after => :transform_rrp_to_usd | |
} | |
} | |
} | |
"======={category:meta}======" | |
{} | |
"======={category:recursive_select}======" | |
true | |
"======={category:sub_page_tag}======" | |
"//ul[contains(@class,'apptrian-subcategories')]//li//a" | |
"======={category:select}======" | |
false | |
"======={category:scrape_if}======" | |
{ | |
:fields => [ | |
[0] :supplier_rrp | |
] | |
} | |
"----------{product}----------" | |
"======={product:fields}======" | |
{ | |
:title => { | |
:xpath => "//div[@class = 'product-view']//div[@class = 'product-name']//h1", | |
:meta => {} | |
}, | |
:description => { | |
:xpath => "//div[@class = 'product-view']//div[@class = 'std']", | |
:meta => { | |
:scrub_tags => true | |
} | |
}, | |
:supplier_rrp => { | |
:xpath => "//div[@class = 'product-view']//div[@class = 'product-shop']//span[@class = 'price']", | |
:meta => { | |
:after => :transform_rrp | |
} | |
}, | |
:mpn => { | |
:xpath => "//div[@class = 'product-view']//div[@class = 'product-shop']//div[@id ='skuInfo']", | |
:meta => {} | |
}, | |
:supplier_product_url => { | |
:xpath => "", | |
:meta => { | |
:page_url => true | |
} | |
}, | |
:weight => { | |
:xpath => "//div[@class = 'product-view']//table[@class = 'data-table']//td[@class = 'data last' and contains(text(), 'lbs')]", | |
:meta => { | |
:default => 100 | |
} | |
}, | |
:price => { | |
:xpath => "", | |
:meta => { | |
:after => :transform_rrp_to_usd | |
} | |
} | |
} | |
"======={product:meta}======" | |
{} | |
"======={product:recursive_select}======" | |
true | |
"======={product:sub_page_tag}======" | |
"//div[contains(@class,'category-products')]//li//a" | |
"======={product:select}======" | |
false | |
"======={product:scrape_if}======" | |
{ | |
:fields => [ | |
[0] :supplier_rrp | |
] | |
} | |
"----------{product_section}----------" | |
"======={product_section:fields}======" | |
{ | |
:title => { | |
:xpath => "//div[@class = 'product-view']//div[@class = 'product-name']//h1", | |
:meta => {} | |
}, | |
:description => { | |
:xpath => "//div[@class = 'product-view']//div[@class = 'std']", | |
:meta => { | |
:scrub_tags => true | |
} | |
}, | |
:supplier_rrp => { | |
:xpath => "//div[@class = 'product-view']//div[@class = 'product-shop']//span[@class = 'price']", | |
:meta => { | |
:after => :transform_rrp | |
} | |
}, | |
:mpn => { | |
:xpath => "//div[@class = 'product-view']//div[@class = 'product-shop']//div[@id ='skuInfo']", | |
:meta => {} | |
}, | |
:supplier_product_url => { | |
:xpath => "", | |
:meta => { | |
:page_url => true | |
} | |
}, | |
:weight => { | |
:xpath => "//div[@class = 'product-view']//table[@class = 'data-table']//td[@class = 'data last' and contains(text(), 'lbs')]", | |
:meta => { | |
:default => 100 | |
} | |
}, | |
:price => { | |
:xpath => "", | |
:meta => { | |
:after => :transform_rrp_to_usd | |
} | |
} | |
} | |
"======={product_section:meta}======" | |
{} | |
"======={product_section:recursive_select}======" | |
true | |
"======={product_section:sub_page_tag}======" | |
"//table[contains(@class,'zebra')]//td[1]//a" | |
"======={product_section:select}======" | |
false | |
"======={product_section:scrape_if}======" | |
{ | |
:fields => [ | |
[0] :supplier_rrp | |
] | |
} | |
Thermatix@Martins-MBP ~/dev/upwork/littlebirdelectronics/webscraper.servocity git:(upstream ⚡ master) 9A ./scrape | |
"----------{}----------" | |
"======={:fields}======" | |
{ | |
:title => { | |
:xpath => "//div[@class = 'product-view']//div[@class = 'product-name']//h1", | |
:meta => {} | |
}, | |
:description => { | |
:xpath => "//div[@class = 'product-view']//div[@class = 'std']", | |
:meta => { | |
:scrub_tags => true | |
} | |
}, | |
:supplier_rrp => { | |
:xpath => "//div[@class = 'product-view']//div[@class = 'product-shop']//span[@class = 'price']", | |
:meta => { | |
:after => :transform_rrp | |
} | |
}, | |
:mpn => { | |
:xpath => "//div[@class = 'product-view']//div[@class = 'product-shop']//div[@id ='skuInfo']", | |
:meta => {} | |
}, | |
:supplier_product_url => { | |
:xpath => "", | |
:meta => { | |
:page_url => true | |
} | |
}, | |
:weight => { | |
:xpath => "//div[@class = 'product-view']//table[@class = 'data-table']//td[@class = 'data last' and contains(text(), 'lbs')]", | |
:meta => { | |
:default => 100 | |
} | |
}, | |
:price => { | |
:xpath => "", | |
:meta => { | |
:after => :transform_rrp_to_usd | |
} | |
} | |
} | |
"======={:meta}======" | |
{} | |
"======={:recursive_select}======" | |
false | |
"======={:sub_page_tag}======" | |
{} | |
"======={:select}======" | |
false | |
"======={:scrape_if}======" | |
nil | |
"----------{main_menu}----------" | |
"======={main_menu:fields}======" | |
{ | |
:title => { | |
:xpath => "//div[@class = 'product-view']//div[@class = 'product-name']//h1", | |
:meta => {} | |
}, | |
:description => { | |
:xpath => "//div[@class = 'product-view']//div[@class = 'std']", | |
:meta => { | |
:scrub_tags => true | |
} | |
}, | |
:supplier_rrp => { | |
:xpath => "//div[@class = 'product-view']//div[@class = 'product-shop']//span[@class = 'price']", | |
:meta => { | |
:after => :transform_rrp | |
} | |
}, | |
:mpn => { | |
:xpath => "//div[@class = 'product-view']//div[@class = 'product-shop']//div[@id ='skuInfo']", | |
:meta => {} | |
}, | |
:supplier_product_url => { | |
:xpath => "", | |
:meta => { | |
:page_url => true | |
} | |
}, | |
:weight => { | |
:xpath => "//div[@class = 'product-view']//table[@class = 'data-table']//td[@class = 'data last' and contains(text(), 'lbs')]", | |
:meta => { | |
:default => 100 | |
} | |
}, | |
:price => { | |
:xpath => "", | |
:meta => { | |
:after => :transform_rrp_to_usd | |
} | |
} | |
} | |
"======={main_menu:meta}======" | |
{} | |
"======={main_menu:recursive_select}======" | |
false | |
"======={main_menu:sub_page_tag}======" | |
"//div[contains(@class,'main-container')]//ul[contains(@class,'itemparent')]//li//a" | |
"======={main_menu:select}======" | |
false | |
"======={main_menu:scrape_if}======" | |
nil | |
"----------{category}----------" | |
"======={category:fields}======" | |
{ | |
:title => { | |
:xpath => "//div[@class = 'product-view']//div[@class = 'product-name']//h1", | |
:meta => {} | |
}, | |
:description => { | |
:xpath => "//div[@class = 'product-view']//div[@class = 'std']", | |
:meta => { | |
:scrub_tags => true | |
} | |
}, | |
:supplier_rrp => { | |
:xpath => "//div[@class = 'product-view']//div[@class = 'product-shop']//span[@class = 'price']", | |
:meta => { | |
:after => :transform_rrp | |
} | |
}, | |
:mpn => { | |
:xpath => "//div[@class = 'product-view']//div[@class = 'product-shop']//div[@id ='skuInfo']", | |
:meta => {} | |
}, | |
:supplier_product_url => { | |
:xpath => "", | |
:meta => { | |
:page_url => true | |
} | |
}, | |
:weight => { | |
:xpath => "//div[@class = 'product-view']//table[@class = 'data-table']//td[@class = 'data last' and contains(text(), 'lbs')]", | |
:meta => { | |
:default => 100 | |
} | |
}, | |
:price => { | |
:xpath => "", | |
:meta => { | |
:after => :transform_rrp_to_usd | |
} | |
} | |
} | |
"======={category:meta}======" | |
{} | |
"======={category:recursive_select}======" | |
true | |
"======={category:sub_page_tag}======" | |
"//ul[contains(@class,'apptrian-subcategories')]//li//a" | |
"======={category:select}======" | |
false | |
"======={category:scrape_if}======" | |
{ | |
:fields => [ | |
[0] :supplier_rrp | |
] | |
} | |
"----------{product}----------" | |
"======={product:fields}======" | |
{ | |
:title => { | |
:xpath => "//div[@class = 'product-view']//div[@class = 'product-name']//h1", | |
:meta => {} | |
}, | |
:description => { | |
:xpath => "//div[@class = 'product-view']//div[@class = 'std']", | |
:meta => { | |
:scrub_tags => true | |
} | |
}, | |
:supplier_rrp => { | |
:xpath => "//div[@class = 'product-view']//div[@class = 'product-shop']//span[@class = 'price']", | |
:meta => { | |
:after => :transform_rrp | |
} | |
}, | |
:mpn => { | |
:xpath => "//div[@class = 'product-view']//div[@class = 'product-shop']//div[@id ='skuInfo']", | |
:meta => {} | |
}, | |
:supplier_product_url => { | |
:xpath => "", | |
:meta => { | |
:page_url => true | |
} | |
}, | |
:weight => { | |
:xpath => "//div[@class = 'product-view']//table[@class = 'data-table']//td[@class = 'data last' and contains(text(), 'lbs')]", | |
:meta => { | |
:default => 100 | |
} | |
}, | |
:price => { | |
:xpath => "", | |
:meta => { | |
:after => :transform_rrp_to_usd | |
} | |
} | |
} | |
"======={product:meta}======" | |
{} | |
"======={product:recursive_select}======" | |
true | |
"======={product:sub_page_tag}======" | |
"//div[contains(@class,'category-products')]//li//a" | |
"======={product:select}======" | |
false | |
"======={product:scrape_if}======" | |
{ | |
:fields => [ | |
[0] :supplier_rrp | |
] | |
} | |
"----------{product_section}----------" | |
"======={product_section:fields}======" | |
{ | |
:title => { | |
:xpath => "//div[@class = 'product-view']//div[@class = 'product-name']//h1", | |
:meta => {} | |
}, | |
:description => { | |
:xpath => "//div[@class = 'product-view']//div[@class = 'std']", | |
:meta => { | |
:scrub_tags => true | |
} | |
}, | |
:supplier_rrp => { | |
:xpath => "//div[@class = 'product-view']//div[@class = 'product-shop']//span[@class = 'price']", | |
:meta => { | |
:after => :transform_rrp | |
} | |
}, | |
:mpn => { | |
:xpath => "//div[@class = 'product-view']//div[@class = 'product-shop']//div[@id ='skuInfo']", | |
:meta => {} | |
}, | |
:supplier_product_url => { | |
:xpath => "", | |
:meta => { | |
:page_url => true | |
} | |
}, | |
:weight => { | |
:xpath => "//div[@class = 'product-view']//table[@class = 'data-table']//td[@class = 'data last' and contains(text(), 'lbs')]", | |
:meta => { | |
:default => 100 | |
} | |
}, | |
:price => { | |
:xpath => "", | |
:meta => { | |
:after => :transform_rrp_to_usd | |
} | |
} | |
} | |
"======={product_section:meta}======" | |
{} | |
"======={product_section:recursive_select}======" | |
true | |
"======={product_section:sub_page_tag}======" | |
"//table[contains(@class,'zebra')]//td[1]//a" | |
"======={product_section:select}======" | |
false | |
"======={product_section:scrape_if}======" | |
{ | |
:fields => [ | |
[0] :supplier_rrp | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment