-
-
Save nerdoncall/0aaffb6f0076b1d0c630 to your computer and use it in GitHub Desktop.
Trying to scrape our website and create a csv file. Noobie here. Getting error on the copying of the data from the hash to the csv. I dont know what I have screwed up. Please help.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'rubygems' | |
require 'nokogiri' | |
require 'open-uri' | |
require 'mechanize' | |
require 'csv' | |
BASEURL = "http://www.pbm-erson.com" | |
BASEURL1 = "http://www.pbm-erson.com/Catalog/PBM/Camshafts_PBM/Performance_CPBM?&sortby=2&pageSize=5&page=" | |
HSH = Hash.new | |
RESULTS = [] | |
BRY = Array.new #attribute name array from products | |
LAST_PAGE_NUMBER = 2 #last page to stop on | |
ARR1 = [] | |
$bsv | |
#go through and get links for each product page | |
CSV.open("results-test.csv","w") do |csv| | |
#$bsv = csv | |
#def csvwrite | |
# csv << $bsv | |
#end | |
def csvheader (bry) | |
#csv << bry | |
puts bry | |
end | |
def find_link | |
puts " Finding Links" | |
item_info_css = ".item-info" | |
product_item_row = ".item-row" | |
product_num = "span.item-num" | |
for current_page_number in 1..LAST_PAGE_NUMBER | |
url = "#{BASEURL1}#{current_page_number}" | |
puts url | |
doc = Nokogiri::HTML(open(url)) | |
products = doc.css(item_info_css) | |
products.each do |product| | |
product_nodes = product.css(product_num) | |
links = product.css('a') | |
linkref = links[0]["href"] | |
raw_results = BASEURL + linkref | |
RESULTS << raw_results | |
sleep 1 | |
end | |
end | |
puts RESULTS#finding links | |
end #find_link | |
def build_att_name_array | |
BRY.push("ProductID:") | |
BRY.push("Name:") | |
BRY.push("Short Description:") | |
BRY.push("HTML Content:")#building attribute name list | |
puts "Building Table Headers" | |
RESULTS.each do |link| | |
url = link | |
doc = Nokogiri::HTML(open(url)) | |
doc.css(".attribute-list-item").each do |attrib| | |
att = attrib.at_css("span.attrib-name").text | |
BRY << att | |
end | |
sleep 1 | |
end | |
end #build_att_name_array | |
#need to verify the lining up of the code from here down | |
def find_att_value | |
puts "Matching attributes to attribute name" | |
hry = BRY.uniq | |
RESULTS.each do |link| | |
url = link | |
doc = Nokogiri::HTML(open(url)) | |
erp_num = doc.at_css("span.erp-num").text.strip #our part number from Sxe | |
short_desc = doc.at_css(".pd-header").text.strip # short description | |
long_desc = doc.at_css(".product-cm").text.strip #html content | |
id = doc.at_css("input#CurrentProductId")['value'] #ID number in console | |
hry.each do |ab| | |
if ab == "ProductID:" | |
HSH[ab] = id | |
elsif ab == "Name:" | |
HSH[ab] = erp_num | |
elsif ab == "Short Description:" | |
HSH[ab] = short_desc | |
elsif ab == "HTML Content:" | |
HSH[ab] = long_desc | |
else | |
begin | |
els = doc.search"[text()*='#{ab}']" | |
el = els.first | |
eln = el.next_element.text.strip | |
HSH[ab] = eln | |
rescue | |
HSH[ab] = "" | |
end #begin | |
end #if | |
puts "-------------------------" | |
puts HSH | |
puts "-------------------------" | |
HSH.each_value do |val| | |
ARR1 << val | |
end #HSH | |
csv << ARR1 # ERROR - undefined local variable or method `csv' | |
ARR1.clear | |
HSH.clear | |
end #hry | |
end#RESULTS | |
end #find_att_value | |
find_link | |
build_att_name_array | |
bry = BRY.uniq | |
csv << bry | |
csvheader(bry) | |
find_att_value | |
end |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment