Created
March 22, 2015 02:30
-
-
Save maetl/ecc79023f5b355499991 to your computer and use it in GitHub Desktop.
Screen scraping 101
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
http://en.wikipedia.org/wiki/List_of_countries_and_dependencies_by_population

Start by making a new project with a Rakefile (no Rails!).
We will pretend that the list/table might be changing all the time, so we want something repeatable to extract the text data from it.
Write Rake tasks to download the HTML, extract the list of countries by population, and generate both JSON and CSV files.
No backend stuff, no browser stuff. Just a pure Ruby command-line script.
The extra challenge is to scrape a separate table and merge the ISO country codes into each item:
http://en.wikipedia.org/wiki/ISO_3166-1
I recommend using Nokogiri rather than regexes or anything crazy to parse the text.
http://www.nokogiri.org/tutorials/parsing_an_html_xml_document.html
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'nokogiri' | |
require 'json' | |
# Wikipedia page slugs to download and scrape.
# Frozen so the literals cannot be mutated accidentally elsewhere.
POPULATION = 'List_of_countries_and_dependencies_by_population'.freeze
ISO_CODES = 'ISO_3166-1'.freeze

# Accumulators populated by the rake tasks below.
# Intentionally mutable (they are filled in as the tasks run), so not frozen.
COUNTRY_CODES = {}
COUNTRIES = []
# Append one country record to the COUNTRIES accumulator.
# code       - ISO 3166-1 alpha-2 code (String)
# name       - country name as scraped from Wikipedia (String)
# population - population count (Integer)
def make_country(code, name, population)
  COUNTRIES << { code: code, name: name, population: population }
end
# Download the given Wikipedia page into the current working directory,
# unless a cached copy already exists there (making the task repeatable).
# Relies on rake's `sh` helper and the external `wget` binary.
def download_wikipedia_page(page_name)
  # File.exists? was deprecated for years and removed in Ruby 3.2;
  # File.exist? is the supported predicate.
  unless File.exist?(File.join(Dir.pwd, page_name))
    sh "wget https://en.wikipedia.org/wiki/#{page_name}"
  end
end
# Read a previously downloaded page from the working directory and parse
# it into a Nokogiri HTML document.
def load_doc(page_name)
  html = File.read(File.join(Dir.pwd, page_name))
  Nokogiri::HTML(html)
end
# Write `data` to the file at `name`, followed by a trailing newline
# (puts semantics). The block form closes the file automatically.
def write_file(name, data)
  File.open(name, 'w') { |out| out.puts(data) }
end
# Fetch both source pages (no-ops after the first run thanks to the
# file-existence check in download_wikipedia_page).
task :download do
  [POPULATION, ISO_CODES].each { |page| download_wikipedia_page(page) }
end
# Build the name -> alpha-2 code lookup from the ISO 3166-1 table.
task extract_codes: :download do
  doc = load_doc(ISO_CODES)
  doc.css('table.wikitable tr').each do |tr|
    name = tr.css('td:nth-child(1)').text
    code = tr.css('td:nth-child(5)').text
    # Nokogiri's .text returns "" (never nil) when a selector matches
    # nothing, so the original `if name && code` was always truthy and
    # header rows inserted a bogus COUNTRY_CODES[""] = "" entry.
    # Skip rows where either cell is empty instead.
    next if name.empty? || code.empty?
    # The code cell reads like "ISO 3166-2:NZ"; keep the part after ':'.
    COUNTRY_CODES[name] = code.split(':').last
  end
end
# Build COUNTRIES from the population table, attaching the ISO code looked
# up in COUNTRY_CODES. Rows without exactly one or two titled links, or
# whose name has no known code, are skipped.
task extract_population: :extract_codes do
  doc = load_doc(POPULATION)
  doc.css('table.wikitable tr').each do |row|
    links = row.css('a[title]')
    name =
      case links.count
      when 1 then links.text
      when 2 then links.first.text
      end
    next unless name && COUNTRY_CODES.key?(name)
    # Population cell is formatted with thousands separators, e.g. "1,234,567".
    population = row.css('td:nth-child(3)').text.gsub(',', '').to_i
    make_country(COUNTRY_CODES[name], name, population)
  end
end
# Line terminator for emitted CSV rows.
EOL = "\n".freeze

# Render COUNTRIES as CSV text into COUNTRIES_CSV.
task build_csv: :extract_population do
  # Seed the accumulator with the header *plus* a newline: the original
  # seed lacked EOL, which glued the first data row onto the header line.
  # NOTE(review): values are not CSV-escaped, so a country name containing
  # a comma would corrupt a row — consider the stdlib `csv` library.
  COUNTRIES_CSV = COUNTRIES.inject("code,name,population#{EOL}") do |csv, country|
    csv << "#{country[:code]},#{country[:name]},#{country[:population]}#{EOL}"
  end
end
# Render COUNTRIES as pretty-printed JSON into COUNTRIES_JSON.
task build_json: :extract_population do
  pretty = JSON.pretty_generate(COUNTRIES)
  COUNTRIES_JSON = pretty
end
# Dump the generated CSV to standard output.
task print_csv: :build_csv do
  $stdout.puts COUNTRIES_CSV
end
# Write the generated CSV to countries.csv in the working directory.
task csv: :build_csv do
  output_name = 'countries.csv'
  write_file(output_name, COUNTRIES_CSV)
end
# Write the generated JSON to countries.json in the working directory.
task json: :build_json do
  output_name = 'countries.json'
  write_file(output_name, COUNTRIES_JSON)
end
# Dump the generated JSON to standard output.
task print_json: :build_json do
  $stdout.puts COUNTRIES_JSON
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment