Skip to content

Instantly share code, notes, and snippets.

@Thermatix
Last active March 2, 2017 22:45
Show Gist options
  • Save Thermatix/0494384fd82cc90d061b140d8d6e7784 to your computer and use it in GitHub Desktop.
Save Thermatix/0494384fd82cc90d061b140d8d6e7784 to your computer and use it in GitHub Desktop.
# A node in a tree of configuration data.
#
# Every node has an identifier (+ident+), a +data+ hash seeded from a template
# hash (+init+), and a hash of child nodes keyed by their identifiers.
# Enumerating a node yields the node itself first and then every node of each
# child subtree, in the order the children were added.
class Node
  include Enumerable

  attr_accessor :childs, :init, :data, :ident

  # @param init [Hash] template used to seed +data+ for this node and for any
  #   children created through #new_child. It is deep-copied so that nested
  #   hashes/arrays are never shared between nodes (or with the caller).
  # @param ident [Object, nil] identifier of this node (nil for a root).
  def initialize(init = {}, ident = nil)
    @ident = ident
    @childs = {}
    @init = deep_copy(init)
    @data = deep_copy(init)
  end

  # Yields this node, then recursively every descendant.
  #
  # @return [Node, Enumerator] self when a block is given, otherwise an
  #   Enumerator over the same sequence.
  def each(&block)
    return enum_for(:each) unless block_given?

    block.call(self)
    @childs.each_value { |child| child.each(&block) }
    self
  end

  # Compares two nodes by their +data+ hashes.
  def <=>(other_node)
    @data <=> other_node.data
  end

  # Walks down the tree following +keys+ (which may be given as separate
  # arguments or as one array). At each step the first node in the current
  # subtree whose ident equals the key is selected.
  #
  # @return [Node, nil] self when this node has no children or +keys+ is
  #   empty; the node reached otherwise; nil when a key matches nothing.
  def [](*keys)
    return self if @childs.empty?

    keys.flatten.inject(self) do |node, ident|
      # Stop descending (and return nil) once a segment has not been found.
      node && node.find { |candidate| candidate.ident == ident }
    end
  end

  # Creates a child seeded from this node's template and registers it under
  # +ident+. When a block is given, the new child is yielded to it first.
  #
  # @return [Node] the newly created child.
  def new_child(ident)
    child = self.class.new(@init, ident)
    yield child if block_given?
    @childs[ident] = child
  end

  # Hook used by the +pp+ library: prints each node's ident followed by its
  # data hash.
  def pretty_print(pp)
    each do |node|
      # ident may be a Symbol or nil; pp.text needs a String.
      pp.text(node.ident.to_s)
      # NOTE: this newline is written straight to $stdout, not through +pp+.
      puts "\n"
      pp.pp_hash node.data
    end
  end

  private

  # Recursively copies hashes, arrays and strings; every other object is
  # returned as-is.
  def deep_copy(value)
    case value
    when Hash
      value.each_with_object({}) { |(key, val), copy| copy[key] = deep_copy(val) }
    when Array
      value.map { |val| deep_copy(val) }
    when String
      value.dup
    else
      value
    end
  end
end
# Scraper definition for servocity.com, declared through class-level DSL calls
# (set_up, set_url, re_useable, sub_page, recursive_select, ...).
class Servo_City < Scraper
  set_up
  set_url "https://www.servocity.com"

  # Multiplier applied to :supplier_rrp by #transform_rrp_to_usd.
  X_Rate = {
    aud: 1.3
  }

  # Field definitions shared by every page type registered below.
  re_useable :define_fields do
    xpath_prefix "//div[@class = 'product-view']" do
      set_field :title, "//div[@class = 'product-name']//h1"
      set_field :description, "//div[@class = 'std']", scrub_tags: true

      xpath_prefix "//div[@class = 'product-shop']" do
        set_field :supplier_rrp, "//span[@class = 'price']", after: :transform_rrp
        set_field :mpn, "//div[@id ='skuInfo']"
      end

      set_field :supplier_product_url, "", page_url: true

      xpath_prefix "//table[@class = 'data-table']" do
        set_field :weight,
                  "//td[@class = 'data last' and contains(text(), 'lbs')]",
                  default: 100
      end

      set_field :price, "", after: :transform_rrp_to_usd
      # set_field :retired, ""
      # set_field :moq
    end
  end

  sub_page(:main_menu, "//div[contains(@class,'main-container')]//ul[contains(@class,'itemparent')]//li//a") do
    # select_sub_page
    page_links = {
      category: "//ul[contains(@class,'apptrian-subcategories')]//li//a",
      product: "//div[contains(@class,'category-products')]//li//a",
      product_section: "//table[contains(@class,'zebra')]//td[1]//a"
    }

    recursive_select(page_links) do
      scrape_here_if(fields: [:supplier_rrp])
      use :define_fields
    end
  end

  # Removes every "$" character from the scraped price text.
  def transform_rrp(string, field_data)
    string.delete("$")
  end

  # Returns :supplier_rrp (as a float) multiplied by the :aud rate.
  def transform_rrp_to_usd(string, field_data)
    rrp = field_data[:supplier_rrp].to_f
    rrp * X_Rate[:aud]
  end

  # Debug dump: prints every node of the configuration tree and each entry of
  # its data hash while the class body is being evaluated.
  @tree.each do |node|
    ap "----------{%s}----------" % node.ident
    node.data.each do |key, value|
      ap "======={%s:%s}======" % [node.ident, key]
      ap value
    end
  end
end
require_relative "node"
class Scraper
  # Declarative methods used to describe what a scraper should visit and
  # extract. All state lives in instance variables of whatever object the
  # methods are called on, and is created by #set_up.
  #
  # NOTE(review): nothing in this block mixes DSL into Scraper; presumably
  # that happens where Scraper is defined further - confirm.
  module DSL
    # Initialises the DSL state (configuration tree, pointer and prefix
    # stacks, re-useable blocks, writer, base URL, pagination increment).
    # Does nothing when called again on the same receiver.
    def set_up
      return if @setup

      @tree = Node.new({
        fields: {},
        meta: {},
        recursive_select: false,
        sub_page_tag: {},
        select: false,
        scrape_if: nil
      })
      @tree_pointer = []
      @xpath_prefix = []
      @useables = {}
      @writer = nil
      @base_url = ""
      @inc = 1
      @setup = true
    end

    # Pushes +key+ onto the tree pointer for the duration of the block.
    # The pointer is restored even when the block raises.
    #
    # @param key [Object] ident of the node to descend into.
    # @param childs [Boolean] unused; kept for backward compatibility.
    # @return [Object] +key+.
    def tree_down(key, childs = false)
      @tree_pointer << key
      begin
        yield
      ensure
        @tree_pointer.pop
      end
      key
    end

    # Stores +block+ under +name+ so it can be replayed later with #use.
    #
    # @raise [RuntimeError] when no block is given.
    def re_useable(name, &block)
      check_for_block(&block)
      @useables[name] = block
    end

    # Evaluates the block registered under +name+ in the context of self.
    #
    # @raise [ArgumentError] when nothing was registered under +name+.
    def use(name)
      block = @useables[name]
      raise ArgumentError, "No re-useable block registered as #{name.inspect}" unless block

      instance_exec(&block)
    end

    # Stores the base URL of the site to scrape.
    def set_url(url)
      @base_url = url
    end

    # Stores the pagination increment.
    def set_pag_increment(value)
      @inc = value
    end

    # Prepends +prefix+ to the xpath of everything declared inside the block.
    # Prefixes nest; the stack is restored even when the block raises.
    #
    # @raise [RuntimeError] when no block is given.
    # @return [String] +prefix+.
    def xpath_prefix(prefix, &block)
      check_for_block(&block)
      @xpath_prefix << prefix
      begin
        yield
      ensure
        @xpath_prefix.pop
      end
      prefix
    end

    # Records a scrape condition on the current node: +args+ when given,
    # otherwise a hash wrapping the block under :block.
    def scrape_here_if(args = nil, &block)
      @tree[@tree_pointer].data[:scrape_if] = args || { block: block }
    end

    # Flags the current node with select: true.
    def select_sub_page
      @tree[@tree_pointer].data[:select] = true
    end

    # Recursively drill into a page: registers one sub page per entry of
    # +page_tags+ (ident => xpath), flags each with recursive_select: true
    # and runs +block+ with the tree pointer set to that sub page.
    #
    # @raise [RuntimeError] when no block is given.
    def recursive_select(page_tags, &block)
      check_for_block(&block)
      page_tags.each do |item, path|
        sub_page(item, path) do
          @tree[@tree_pointer].data[:recursive_select] = true
          block.call
        end
      end
    end

    # Drill down into a page: creates a child node +item+ under the current
    # node, stores the (prefixed) xpath of its links as :sub_page_tag and
    # yields with the tree pointer moved to the new child.
    def sub_page(item, path)
      @tree[@tree_pointer].new_child(item).data[:sub_page_tag] = join_xpath(path)
      tree_down(item) do
        yield
      end
    end

    # Declares a field to extract on the current node.
    def set_field(name, xpath, meta = {})
      set_info(:fields, name, xpath, meta)
    end

    # Declares a meta tag to extract on the current node.
    def set_meta_tag(name, xpath, meta = {})
      set_info(:meta, name, xpath, meta)
    end

    # Stores the block to be used as writer.
    def writer(&block)
      @writer = block
    end

    private

    # Raises unless a block was forwarded. The error message names the
    # calling DSL method; the name is taken from the call stack rather than
    # parsed out of a formatted backtrace line.
    def check_for_block(&block)
      return if block

      raise "No block given for #%s" % caller_locations(1, 1).first.base_label
    end

    # Stores an xpath/meta pair under data[type][name] of the current node.
    def set_info(type, name, xpath, meta)
      @tree[@tree_pointer].data[type][name] = {
        xpath: join_xpath(xpath),
        meta: meta
      }
    end

    # Prepends all active prefixes to +tag+; an empty tag stays empty.
    def join_xpath(tag)
      tag.empty? ? tag : @xpath_prefix.join + tag
    end
  end
end
V this is the pointer before the code block
[
[0] :main_menu,
[1] :category
]
:defining_fields
V this is the pointer inside the code block
[
[0] :main_menu,
[1] :category
]
V this is the pointer before the code block
[
[0] :main_menu,
[1] :product
]
:defining_fields
V this is the pointer inside the code block
[
[0] :main_menu,
[1] :product
]
V this is the pointer before the code block
[
[0] :main_menu,
[1] :product_section
]
:defining_fields
V this is the pointer inside the code block
[
[0] :main_menu,
[1] :product_section
]
"----------{}----------"
"======={:fields}======"
{
:title => {
:xpath => "//div[@class = 'product-view']//div[@class = 'product-name']//h1",
:meta => {}
},
:description => {
:xpath => "//div[@class = 'product-view']//div[@class = 'std']",
:meta => {
:scrub_tags => true
}
},
:supplier_rrp => {
:xpath => "//div[@class = 'product-view']//div[@class = 'product-shop']//span[@class = 'price']",
:meta => {
:after => :transform_rrp
}
},
:mpn => {
:xpath => "//div[@class = 'product-view']//div[@class = 'product-shop']//div[@id ='skuInfo']",
:meta => {}
},
:supplier_product_url => {
:xpath => "",
:meta => {
:page_url => true
}
},
:weight => {
:xpath => "//div[@class = 'product-view']//table[@class = 'data-table']//td[@class = 'data last' and contains(text(), 'lbs')]",
:meta => {
:default => 100
}
},
:price => {
:xpath => "",
:meta => {
:after => :transform_rrp_to_usd
}
}
}
"======={:meta}======"
{}
"======={:recursive_select}======"
false
"======={:sub_page_tag}======"
{}
"======={:select}======"
false
"======={:scrape_if}======"
nil
"----------{main_menu}----------"
"======={main_menu:fields}======"
{
:title => {
:xpath => "//div[@class = 'product-view']//div[@class = 'product-name']//h1",
:meta => {}
},
:description => {
:xpath => "//div[@class = 'product-view']//div[@class = 'std']",
:meta => {
:scrub_tags => true
}
},
:supplier_rrp => {
:xpath => "//div[@class = 'product-view']//div[@class = 'product-shop']//span[@class = 'price']",
:meta => {
:after => :transform_rrp
}
},
:mpn => {
:xpath => "//div[@class = 'product-view']//div[@class = 'product-shop']//div[@id ='skuInfo']",
:meta => {}
},
:supplier_product_url => {
:xpath => "",
:meta => {
:page_url => true
}
},
:weight => {
:xpath => "//div[@class = 'product-view']//table[@class = 'data-table']//td[@class = 'data last' and contains(text(), 'lbs')]",
:meta => {
:default => 100
}
},
:price => {
:xpath => "",
:meta => {
:after => :transform_rrp_to_usd
}
}
}
"======={main_menu:meta}======"
{}
"======={main_menu:recursive_select}======"
false
"======={main_menu:sub_page_tag}======"
"//div[contains(@class,'main-container')]//ul[contains(@class,'itemparent')]//li//a"
"======={main_menu:select}======"
false
"======={main_menu:scrape_if}======"
nil
"----------{category}----------"
"======={category:fields}======"
{
:title => {
:xpath => "//div[@class = 'product-view']//div[@class = 'product-name']//h1",
:meta => {}
},
:description => {
:xpath => "//div[@class = 'product-view']//div[@class = 'std']",
:meta => {
:scrub_tags => true
}
},
:supplier_rrp => {
:xpath => "//div[@class = 'product-view']//div[@class = 'product-shop']//span[@class = 'price']",
:meta => {
:after => :transform_rrp
}
},
:mpn => {
:xpath => "//div[@class = 'product-view']//div[@class = 'product-shop']//div[@id ='skuInfo']",
:meta => {}
},
:supplier_product_url => {
:xpath => "",
:meta => {
:page_url => true
}
},
:weight => {
:xpath => "//div[@class = 'product-view']//table[@class = 'data-table']//td[@class = 'data last' and contains(text(), 'lbs')]",
:meta => {
:default => 100
}
},
:price => {
:xpath => "",
:meta => {
:after => :transform_rrp_to_usd
}
}
}
"======={category:meta}======"
{}
"======={category:recursive_select}======"
true
"======={category:sub_page_tag}======"
"//ul[contains(@class,'apptrian-subcategories')]//li//a"
"======={category:select}======"
false
"======={category:scrape_if}======"
{
:fields => [
[0] :supplier_rrp
]
}
"----------{product}----------"
"======={product:fields}======"
{
:title => {
:xpath => "//div[@class = 'product-view']//div[@class = 'product-name']//h1",
:meta => {}
},
:description => {
:xpath => "//div[@class = 'product-view']//div[@class = 'std']",
:meta => {
:scrub_tags => true
}
},
:supplier_rrp => {
:xpath => "//div[@class = 'product-view']//div[@class = 'product-shop']//span[@class = 'price']",
:meta => {
:after => :transform_rrp
}
},
:mpn => {
:xpath => "//div[@class = 'product-view']//div[@class = 'product-shop']//div[@id ='skuInfo']",
:meta => {}
},
:supplier_product_url => {
:xpath => "",
:meta => {
:page_url => true
}
},
:weight => {
:xpath => "//div[@class = 'product-view']//table[@class = 'data-table']//td[@class = 'data last' and contains(text(), 'lbs')]",
:meta => {
:default => 100
}
},
:price => {
:xpath => "",
:meta => {
:after => :transform_rrp_to_usd
}
}
}
"======={product:meta}======"
{}
"======={product:recursive_select}======"
true
"======={product:sub_page_tag}======"
"//div[contains(@class,'category-products')]//li//a"
"======={product:select}======"
false
"======={product:scrape_if}======"
{
:fields => [
[0] :supplier_rrp
]
}
"----------{product_section}----------"
"======={product_section:fields}======"
{
:title => {
:xpath => "//div[@class = 'product-view']//div[@class = 'product-name']//h1",
:meta => {}
},
:description => {
:xpath => "//div[@class = 'product-view']//div[@class = 'std']",
:meta => {
:scrub_tags => true
}
},
:supplier_rrp => {
:xpath => "//div[@class = 'product-view']//div[@class = 'product-shop']//span[@class = 'price']",
:meta => {
:after => :transform_rrp
}
},
:mpn => {
:xpath => "//div[@class = 'product-view']//div[@class = 'product-shop']//div[@id ='skuInfo']",
:meta => {}
},
:supplier_product_url => {
:xpath => "",
:meta => {
:page_url => true
}
},
:weight => {
:xpath => "//div[@class = 'product-view']//table[@class = 'data-table']//td[@class = 'data last' and contains(text(), 'lbs')]",
:meta => {
:default => 100
}
},
:price => {
:xpath => "",
:meta => {
:after => :transform_rrp_to_usd
}
}
}
"======={product_section:meta}======"
{}
"======={product_section:recursive_select}======"
true
"======={product_section:sub_page_tag}======"
"//table[contains(@class,'zebra')]//td[1]//a"
"======={product_section:select}======"
false
"======={product_section:scrape_if}======"
{
:fields => [
[0] :supplier_rrp
]
}
Thermatix@Martins-MBP  ~/dev/upwork/littlebirdelectronics/webscraper.servocity  git:(upstream ⚡ master) 9A ./scrape
"----------{}----------"
"======={:fields}======"
{
:title => {
:xpath => "//div[@class = 'product-view']//div[@class = 'product-name']//h1",
:meta => {}
},
:description => {
:xpath => "//div[@class = 'product-view']//div[@class = 'std']",
:meta => {
:scrub_tags => true
}
},
:supplier_rrp => {
:xpath => "//div[@class = 'product-view']//div[@class = 'product-shop']//span[@class = 'price']",
:meta => {
:after => :transform_rrp
}
},
:mpn => {
:xpath => "//div[@class = 'product-view']//div[@class = 'product-shop']//div[@id ='skuInfo']",
:meta => {}
},
:supplier_product_url => {
:xpath => "",
:meta => {
:page_url => true
}
},
:weight => {
:xpath => "//div[@class = 'product-view']//table[@class = 'data-table']//td[@class = 'data last' and contains(text(), 'lbs')]",
:meta => {
:default => 100
}
},
:price => {
:xpath => "",
:meta => {
:after => :transform_rrp_to_usd
}
}
}
"======={:meta}======"
{}
"======={:recursive_select}======"
false
"======={:sub_page_tag}======"
{}
"======={:select}======"
false
"======={:scrape_if}======"
nil
"----------{main_menu}----------"
"======={main_menu:fields}======"
{
:title => {
:xpath => "//div[@class = 'product-view']//div[@class = 'product-name']//h1",
:meta => {}
},
:description => {
:xpath => "//div[@class = 'product-view']//div[@class = 'std']",
:meta => {
:scrub_tags => true
}
},
:supplier_rrp => {
:xpath => "//div[@class = 'product-view']//div[@class = 'product-shop']//span[@class = 'price']",
:meta => {
:after => :transform_rrp
}
},
:mpn => {
:xpath => "//div[@class = 'product-view']//div[@class = 'product-shop']//div[@id ='skuInfo']",
:meta => {}
},
:supplier_product_url => {
:xpath => "",
:meta => {
:page_url => true
}
},
:weight => {
:xpath => "//div[@class = 'product-view']//table[@class = 'data-table']//td[@class = 'data last' and contains(text(), 'lbs')]",
:meta => {
:default => 100
}
},
:price => {
:xpath => "",
:meta => {
:after => :transform_rrp_to_usd
}
}
}
"======={main_menu:meta}======"
{}
"======={main_menu:recursive_select}======"
false
"======={main_menu:sub_page_tag}======"
"//div[contains(@class,'main-container')]//ul[contains(@class,'itemparent')]//li//a"
"======={main_menu:select}======"
false
"======={main_menu:scrape_if}======"
nil
"----------{category}----------"
"======={category:fields}======"
{
:title => {
:xpath => "//div[@class = 'product-view']//div[@class = 'product-name']//h1",
:meta => {}
},
:description => {
:xpath => "//div[@class = 'product-view']//div[@class = 'std']",
:meta => {
:scrub_tags => true
}
},
:supplier_rrp => {
:xpath => "//div[@class = 'product-view']//div[@class = 'product-shop']//span[@class = 'price']",
:meta => {
:after => :transform_rrp
}
},
:mpn => {
:xpath => "//div[@class = 'product-view']//div[@class = 'product-shop']//div[@id ='skuInfo']",
:meta => {}
},
:supplier_product_url => {
:xpath => "",
:meta => {
:page_url => true
}
},
:weight => {
:xpath => "//div[@class = 'product-view']//table[@class = 'data-table']//td[@class = 'data last' and contains(text(), 'lbs')]",
:meta => {
:default => 100
}
},
:price => {
:xpath => "",
:meta => {
:after => :transform_rrp_to_usd
}
}
}
"======={category:meta}======"
{}
"======={category:recursive_select}======"
true
"======={category:sub_page_tag}======"
"//ul[contains(@class,'apptrian-subcategories')]//li//a"
"======={category:select}======"
false
"======={category:scrape_if}======"
{
:fields => [
[0] :supplier_rrp
]
}
"----------{product}----------"
"======={product:fields}======"
{
:title => {
:xpath => "//div[@class = 'product-view']//div[@class = 'product-name']//h1",
:meta => {}
},
:description => {
:xpath => "//div[@class = 'product-view']//div[@class = 'std']",
:meta => {
:scrub_tags => true
}
},
:supplier_rrp => {
:xpath => "//div[@class = 'product-view']//div[@class = 'product-shop']//span[@class = 'price']",
:meta => {
:after => :transform_rrp
}
},
:mpn => {
:xpath => "//div[@class = 'product-view']//div[@class = 'product-shop']//div[@id ='skuInfo']",
:meta => {}
},
:supplier_product_url => {
:xpath => "",
:meta => {
:page_url => true
}
},
:weight => {
:xpath => "//div[@class = 'product-view']//table[@class = 'data-table']//td[@class = 'data last' and contains(text(), 'lbs')]",
:meta => {
:default => 100
}
},
:price => {
:xpath => "",
:meta => {
:after => :transform_rrp_to_usd
}
}
}
"======={product:meta}======"
{}
"======={product:recursive_select}======"
true
"======={product:sub_page_tag}======"
"//div[contains(@class,'category-products')]//li//a"
"======={product:select}======"
false
"======={product:scrape_if}======"
{
:fields => [
[0] :supplier_rrp
]
}
"----------{product_section}----------"
"======={product_section:fields}======"
{
:title => {
:xpath => "//div[@class = 'product-view']//div[@class = 'product-name']//h1",
:meta => {}
},
:description => {
:xpath => "//div[@class = 'product-view']//div[@class = 'std']",
:meta => {
:scrub_tags => true
}
},
:supplier_rrp => {
:xpath => "//div[@class = 'product-view']//div[@class = 'product-shop']//span[@class = 'price']",
:meta => {
:after => :transform_rrp
}
},
:mpn => {
:xpath => "//div[@class = 'product-view']//div[@class = 'product-shop']//div[@id ='skuInfo']",
:meta => {}
},
:supplier_product_url => {
:xpath => "",
:meta => {
:page_url => true
}
},
:weight => {
:xpath => "//div[@class = 'product-view']//table[@class = 'data-table']//td[@class = 'data last' and contains(text(), 'lbs')]",
:meta => {
:default => 100
}
},
:price => {
:xpath => "",
:meta => {
:after => :transform_rrp_to_usd
}
}
}
"======={product_section:meta}======"
{}
"======={product_section:recursive_select}======"
true
"======={product_section:sub_page_tag}======"
"//table[contains(@class,'zebra')]//td[1]//a"
"======={product_section:select}======"
false
"======={product_section:scrape_if}======"
{
:fields => [
[0] :supplier_rrp
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment