Created
June 13, 2013 19:13
-
-
Save stevenabrooks/5776496 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Columns to add to Students table:
# Name
# Profile Pic
# Social media links/ Twitter, LinkedIn, GitHub, Blog(RSS)
# Quote
# About/ Bio, Education, Work
# Coder Cred / Treehouse, Codeschool, Coderwall .reject(Github)
require 'nokogiri'
require 'open-uri'
require 'sqlite3'
require 'uri'
# Path of the SQLite database file and base URL of the student directory.
database = "flatiron.db"
index_html = "http://students.flatironschool.com"

# Scrape index.html
# URI#open (mixed in by open-uri) is used instead of Kernel#open: Kernel#open
# no longer fetches URLs on Ruby 3.0+, and for a string starting with "|" it
# would spawn a subprocess. The block form closes the response IO once the
# document has been parsed.
index = URI.parse(index_html).open { |io| Nokogiri::HTML(io) }
student_css_selector = "li.home-blog-post div.blog-title a" # div.big-comment before "a" won't select Matt's profile
students = index.css(student_css_selector)

# create an array of lower-cased relative URLs, one per student link;
# anchors without an href attribute are skipped instead of raising on nil
students_html_array = students.map { |student| student.attr("href") }.compact.map(&:downcase)
puts "\nThe students_html_array looks like this:\n #{students_html_array.inspect}"
# Scrape individual student profiles based on the array created from scraping index.html
# Open (creating if necessary) the database and drop the Students table if it
# exists, so every run starts from an empty table.
db = nil
begin
  # A single handle is enough: the original opened the file twice
  # (Database.new, then Database.open) and never closed the first handle.
  db = SQLite3::Database.new database
  db.execute("DROP TABLE IF EXISTS Students")
rescue SQLite3::Exception => e
  # Best effort: report the problem and carry on with the scrape.
  puts "Exception occurred"
  puts e
ensure
  db.close if db
end
# Loop through each student profile URL in the array and insert all the info as a row in the students table
students_html_array.each do |student_html|
  # only scrape page if page linked to from index.html exists ("#" is a placeholder href)
  next if student_html == "#"

  # Reset per iteration so the ensure clause below only ever closes a handle
  # that was opened for this student.
  db = nil
  begin
    puts # empty row

    # URI#open (from open-uri) instead of Kernel#open: Kernel#open no longer
    # fetches URLs on Ruby 3.0+. The block form closes the response IO after
    # the page has been parsed.
    profile_url = "#{index_html}/#{student_html}"
    student_page = URI.parse(profile_url).open { |io| Nokogiri::HTML(io) }

    # Get student's name
    name_css_selector = "h4.ib_main_header"
    html_tag_for_name = student_page.css(name_css_selector).first # nil if the ib_main_header css selector is not found
    puts html_tag_for_name.class

    # only scrape the rest of page if html_tag_for_name is found (to make sure that only correctly formatted pages are scraped)
    unless html_tag_for_name
      puts "#{student_html} isn't the correct template. Page will not be scraped."
      next
    end

    student_name = html_tag_for_name.content
    puts "...scraping: #{student_name}"

    # Get social media links. A profile without any social icons yields nil
    # (stored as NULL) instead of raising NoMethodError and aborting the run.
    social_media_selector = "div.social-icons a"
    first_social_link = student_page.css(social_media_selector).first
    social_media_twitter = first_social_link && first_social_link.attr("href")

    # Get rest of columns for students
    # CODE TO BE FILLED

    # start manipulating the database
    # open the database
    db = SQLite3::Database.open database

    # create Students table if it doesn't exist
    db.execute("CREATE TABLE IF NOT EXISTS Students(id INTEGER PRIMARY KEY AUTOINCREMENT,
                                                    name TEXT,
                                                    twitter_link TEXT
                                                    )")

    # Replace any existing row for this student: delete by name, then insert.
    # Bind values are passed as arrays; passing them as extra positional
    # arguments is deprecated in the sqlite3 gem.
    db.execute("DELETE FROM Students WHERE name=?", [student_name])
    db.execute("INSERT INTO Students(name, twitter_link) VALUES(?,?)",
               [student_name, social_media_twitter])
  rescue OpenURI::HTTPError => e
    puts "No profile found at #{student_html}"
    puts e
  rescue URI::InvalidURIError => e
    # A malformed href on the index page should skip one profile, not end the run.
    puts "Invalid profile URL: #{student_html}"
    puts e
  rescue SQLite3::Exception => e
    puts "SQLite3 Exception occurred"
    puts e
  ensure
    db.close if db
  end # end the begin-rescues block (potential errors: OpenURI::HTTPError, URI::InvalidURIError, SQLite3::Exception)
end # end the loop students_html_array.each
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment