Skip to content

Instantly share code, notes, and snippets.

@arjunvenkat
Last active December 14, 2015 06:28
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save arjunvenkat/5042486 to your computer and use it in GitHub Desktop.
Save arjunvenkat/5042486 to your computer and use it in GitHub Desktop.
First and second parts of the scraper lab. We scraped data from Craigslist and saved it into a CSV file.
require 'open-uri'
require 'nokogiri'
require 'mechanize'
agent = Mechanize.new
page = agent.get('http://www.reddit.com/')
pages_to_visit = 5
# Prints the title of the first link on each of the first five pages,
# following the pagination links from one page to the next.
pages_to_visit.times do |page_index|
  first_title = page.search('#siteTable .title .title').first
  # Skip printing instead of raising when the page has no matching title.
  puts first_title.text if first_title

  # Nothing is printed after the last page, so do not fetch beyond it.
  break if page_index == pages_to_visit - 1

  nav_links = page.search('.nextprev a')
  # When there are two or more navigation links the second one is followed
  # (presumably "prev" comes before "next" -- confirm against the markup);
  # when there is only one, that one is followed.
  next_link = nav_links.count > 1 ? nav_links[1] : nav_links.first
  # Stop cleanly when there is no further page to visit.
  break unless next_link

  # Read the href as a plain string rather than passing an attribute node.
  page = agent.get(next_link['href'])
end
require 'open-uri'
require 'nokogiri'
require 'mechanize'
require 'csv'
# Scrapes apartment listings from the Chicago Craigslist index, following the
# 'next 100 postings' link page by page, and appends every listing to
# scraping_data_pt2.csv as description / price / bedrooms / location.
agent = Mechanize.new
url = 'http://chicago.craigslist.org/apa/'
page = agent.get(url)
columns = ["description", "price", "bedrooms", "location"]

# Start from a fresh file that contains only the header row; the rows of each
# page are appended further down as soon as that page has been parsed.
CSV.open("scraping_data_pt2.csv", "w+") do |csv|
  csv << columns
end

count = 0
loop do
  listings_array = page.search('.row').map do |listing|
    listing_hash = {}
    listing_hash['description'] = listing.css('a').text

    # The '.itemph' text combines price and bedroom count (and possibly more).
    # Each value is extracted independently so that a listing missing one of
    # them still yields the other; a missing value is recorded as 'N/A'.
    price_bedrooms_size = listing.css('.itemph').text
    price = price_bedrooms_size[/\$\s*(\d+)/, 1]
    bedrooms = price_bedrooms_size[/(\d+)\s*br/, 1]
    listing_hash['price'] = price ? price.to_i : 'N/A'
    listing_hash['bedrooms'] = bedrooms ? bedrooms.to_i : 'N/A'

    # Fall back to 'N/A' only when the location element is missing or blank.
    location = listing.css('.itempn').text
    listing_hash['location'] = location.strip.empty? ? 'N/A' : location

    listing_hash
  end

  CSV.open("scraping_data_pt2.csv", "a+") do |csv|
    listings_array.each { |listing| csv << listing.values_at(*columns) }
  end

  # Report the number of rows actually written so far.
  count += listings_array.length
  puts
  puts "#{count} listings saved"
  puts "============================================"
  puts

  # Look for the next page only after the current one has been saved, so the
  # final page (the one without a 'next' link) is scraped as well.
  next_link = page.link_with(text: 'next 100 postings')
  break unless next_link

  page = next_link.click
end
require 'open-uri'
require 'nokogiri'
require 'csv'
# Scrapes the first page of Chicago Craigslist apartment listings and writes
# them to scraping_data.csv as description / price / bedrooms / location.
url = 'http://chicago.craigslist.org/apa/'
# URI.open (from open-uri) is used because Kernel#open no longer fetches URLs
# on Ruby 3.0 and later.
doc = Nokogiri::HTML(URI.open(url))
columns = ["description", "price", "bedrooms", "location"]

listings_array = doc.css('.row').map do |listing|
  listing_hash = {}
  listing_hash['description'] = listing.css('a').text

  # The '.itemph' text combines price and bedroom count (and possibly more).
  # Each value is extracted independently so that a listing missing one of
  # them still yields the other; a missing value is recorded as 'N/A'.
  price_bedrooms_size = listing.css('.itemph').text
  price = price_bedrooms_size[/\$\s*(\d+)/, 1]
  bedrooms = price_bedrooms_size[/(\d+)\s*br/, 1]
  listing_hash['price'] = price ? price.to_i : 'N/A'
  listing_hash['bedrooms'] = bedrooms ? bedrooms.to_i : 'N/A'

  # Fall back to 'N/A' only when the location element is missing or blank.
  location = listing.css('.itempn').text
  listing_hash['location'] = location.strip.empty? ? 'N/A' : location

  listing_hash
end

# Write the header followed by one row per listing, in column order.
CSV.open("scraping_data.csv", "w+") do |csv|
  csv << columns
  listings_array.each { |listing| csv << listing.values_at(*columns) }
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment