Last active
December 14, 2015 06:28
-
-
Save arjunvenkat/5042486 to your computer and use it in GitHub Desktop.
First and second parts of the scraper lab. We scraped data from Craigslist and saved it to a CSV file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'open-uri'
require 'nokogiri'
require 'mechanize'

# Prints the title of the first link on each of the first five reddit
# listing pages, following the "next" pagination link between pages.

pages_to_visit = 5

agent = Mechanize.new
page = agent.get('http://www.reddit.com/')

pages_to_visit.times do |index|
  first_title = page.search('#siteTable .title .title').first
  unless first_title
    # Without this guard a page with no matching titles raised NoMethodError.
    warn 'No titles found on this page; stopping.'
    break
  end

  puts first_title.text

  # The last page has been printed; do not fetch one we will never use.
  break if index == pages_to_visit - 1

  nav_links = page.search('.nextprev a')
  if nav_links.empty?
    warn 'No pagination link found; stopping.'
    break
  end

  # When two links are present the second one is followed (presumably
  # "prev" then "next" -- confirm against the live markup); when only one
  # is present it is followed as-is.
  next_link = nav_links.count > 1 ? nav_links[1] : nav_links.first

  # Node#[] yields the href as a String (NodeSet#attr returns an Attr node).
  page = agent.get(next_link['href'])
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'open-uri'
require 'nokogiri'
require 'mechanize'
require 'csv'

# Scrapes apartment listings from Chicago craigslist, following the
# "next 100 postings" link page by page, and appends each page's rows to
# scraping_data_pt2.csv.

# Extracts the price and bedroom count from a listing's ".itemph" text.
#
# NOTE(review): the text presumably looks like "$1200 / 2br - 900ft" --
# confirm against the live markup. Either part may be missing.
#
# @param text [String] raw text of the ".itemph" element
# @return [Array] [price, bedrooms]; each is an Integer, or 'N/A' when the
#   corresponding token is absent from the text
def parse_price_and_bedrooms(text)
  # Digits (optionally comma-grouped) that follow a dollar sign.
  price = text[/\$\s*(\d[\d,]*)/, 1]
  # Digits immediately preceding "br".
  bedrooms = text[/(\d+)\s*br/, 1]

  [
    price ? price.delete(',').to_i : 'N/A',
    bedrooms ? bedrooms.to_i : 'N/A'
  ]
end

# Converts one ".row" listing node into a CSV row.
#
# @param listing [Nokogiri::XML::Node] a single listing element
# @return [Array] [description, price, bedrooms, location]
def listing_to_row(listing)
  price, bedrooms = parse_price_and_bedrooms(listing.css('.itemph').text)

  # Test the scraped text itself so that a blank location becomes 'N/A';
  # surrounding whitespace is stripped before storing.
  location = listing.css('.itempn').text.strip
  location = 'N/A' if location.empty?

  [listing.css('a').text, price, bedrooms, location]
end

csv_path = 'scraping_data_pt2.csv'
agent = Mechanize.new
page = agent.get('http://chicago.craigslist.org/apa/')

# Start a fresh file containing only the header row.
CSV.open(csv_path, 'w') do |csv|
  csv << %w[description price bedrooms location]
end

count = 0
loop do
  rows = page.search('.row').map { |listing| listing_to_row(listing) }

  # Append after every page so earlier pages are on disk if a later
  # request fails.
  CSV.open(csv_path, 'a') do |csv|
    rows.each { |row| csv << row }
  end

  count += rows.size
  puts
  puts "#{count} listings saved"
  puts '============================================'
  puts

  # Look for the next link only after saving, so the final page (which has
  # no such link) is still written out.
  next_link = page.link_with(text: 'next 100 postings')
  break unless next_link

  page = next_link.click
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'open-uri'
require 'nokogiri'
require 'csv'

# Scrapes the first page of Chicago craigslist apartment listings and
# writes one CSV row per listing to scraping_data.csv.

# Extracts the price and bedroom count from a listing's ".itemph" text.
#
# NOTE(review): the text presumably looks like "$1200 / 2br - 900ft" --
# confirm against the live markup. Either part may be missing.
#
# @param text [String] raw text of the ".itemph" element
# @return [Array] [price, bedrooms]; each is an Integer, or 'N/A' when the
#   corresponding token is absent from the text
def parse_price_and_bedrooms(text)
  # Digits (optionally comma-grouped) that follow a dollar sign.
  price = text[/\$\s*(\d[\d,]*)/, 1]
  # Digits immediately preceding "br".
  bedrooms = text[/(\d+)\s*br/, 1]

  [
    price ? price.delete(',').to_i : 'N/A',
    bedrooms ? bedrooms.to_i : 'N/A'
  ]
end

# Converts one ".row" listing node into a CSV row.
#
# @param listing [Nokogiri::XML::Node] a single listing element
# @return [Array] [description, price, bedrooms, location]
def listing_to_row(listing)
  price, bedrooms = parse_price_and_bedrooms(listing.css('.itemph').text)

  # Test the scraped text itself so that a blank location becomes 'N/A';
  # surrounding whitespace is stripped before storing.
  location = listing.css('.itempn').text.strip
  location = 'N/A' if location.empty?

  [listing.css('a').text, price, bedrooms, location]
end

url = 'http://chicago.craigslist.org/apa/'

# URI.open is the open-uri entry point; Kernel#open no longer accepts URLs
# on Ruby 3.0+.
doc = Nokogiri::HTML(URI.open(url))

rows = doc.css('.row').map { |listing| listing_to_row(listing) }

CSV.open('scraping_data.csv', 'w+') do |csv|
  csv << %w[description price bedrooms location]
  rows.each { |row| csv << row }
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment