Created
March 22, 2015 02:30
-
-
Save maetl/ecc79023f5b355499991 to your computer and use it in GitHub Desktop.
Screen scraping 101
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
http://en.wikipedia.org/wiki/List_of_countries_and_dependencies_by_population

Start by making a new project with a Rakefile (no Rails!).
We will pretend that the list/table might be changing all the time, so we want something repeatable to extract the text data from it.
Write Rake tasks to download the HTML, extract the list of countries by population, and generate both JSON and CSV files.
No backend stuff, no browser stuff. Just a pure Ruby command-line script.
The extra challenge is to scrape a separate table and merge the ISO country codes into each item:
http://en.wikipedia.org/wiki/ISO_3166-1
I recommend using Nokogiri rather than regexes or anything crazy to parse the text.
http://www.nokogiri.org/tutorials/parsing_an_html_xml_document.html
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'nokogiri' | |
require 'json' | |
# Wikipedia page slugs to download and scrape.
# Frozen so the literals cannot be mutated accidentally elsewhere.
POPULATION = 'List_of_countries_and_dependencies_by_population'.freeze
ISO_CODES = 'ISO_3166-1'.freeze

# Accumulators populated by the rake tasks below.
# Intentionally mutable (they are filled in as the tasks run), so not frozen.
COUNTRY_CODES = {}
COUNTRIES = []
# Append one country record to the COUNTRIES accumulator.
# code       - ISO 3166-1 alpha-2 code (String)
# name       - country name as scraped from Wikipedia (String)
# population - population count (Integer)
def make_country(code, name, population)
  COUNTRIES << { code: code, name: name, population: population }
end
# Download the given Wikipedia page into the current working directory,
# unless a cached copy already exists there (making the task repeatable).
# Relies on rake's `sh` helper and the external `wget` binary.
def download_wikipedia_page(page_name)
  # File.exists? was deprecated for years and removed in Ruby 3.2;
  # File.exist? is the supported predicate.
  unless File.exist?(File.join(Dir.pwd, page_name))
    sh "wget https://en.wikipedia.org/wiki/#{page_name}"
  end
end
# Read a previously downloaded page from the working directory and parse
# it into a Nokogiri HTML document.
def load_doc(page_name)
  html = File.read(File.join(Dir.pwd, page_name))
  Nokogiri::HTML(html)
end
# Write `data` to the file at `name`, followed by a trailing newline
# (puts semantics). The block form closes the file automatically.
def write_file(name, data)
  File.open(name, 'w') { |out| out.puts(data) }
end
# Fetch both source pages (no-ops after the first run thanks to the
# file-existence check in download_wikipedia_page).
task :download do
  [POPULATION, ISO_CODES].each { |page| download_wikipedia_page(page) }
end
# Build the name -> alpha-2 code lookup from the ISO 3166-1 table.
task extract_codes: :download do
  doc = load_doc(ISO_CODES)
  doc.css('table.wikitable tr').each do |tr|
    name = tr.css('td:nth-child(1)').text
    code = tr.css('td:nth-child(5)').text
    # Nokogiri's .text returns "" (never nil) when a selector matches
    # nothing, so the original `if name && code` was always truthy and
    # header rows inserted a bogus COUNTRY_CODES[""] = "" entry.
    # Skip rows where either cell is empty instead.
    next if name.empty? || code.empty?
    # The code cell reads like "ISO 3166-2:NZ"; keep the part after ':'.
    COUNTRY_CODES[name] = code.split(':').last
  end
end
# Build COUNTRIES from the population table, attaching the ISO code looked
# up in COUNTRY_CODES. Rows without exactly one or two titled links, or
# whose name has no known code, are skipped.
task extract_population: :extract_codes do
  doc = load_doc(POPULATION)
  doc.css('table.wikitable tr').each do |row|
    links = row.css('a[title]')
    name =
      case links.count
      when 1 then links.text
      when 2 then links.first.text
      end
    next unless name && COUNTRY_CODES.key?(name)
    # Population cell is formatted with thousands separators, e.g. "1,234,567".
    population = row.css('td:nth-child(3)').text.gsub(',', '').to_i
    make_country(COUNTRY_CODES[name], name, population)
  end
end
# Line terminator for emitted CSV rows.
EOL = "\n".freeze

# Render COUNTRIES as CSV text into COUNTRIES_CSV.
task build_csv: :extract_population do
  # Seed the accumulator with the header *plus* a newline: the original
  # seed lacked EOL, which glued the first data row onto the header line.
  # NOTE(review): values are not CSV-escaped, so a country name containing
  # a comma would corrupt a row — consider the stdlib `csv` library.
  COUNTRIES_CSV = COUNTRIES.inject("code,name,population#{EOL}") do |csv, country|
    csv << "#{country[:code]},#{country[:name]},#{country[:population]}#{EOL}"
  end
end
# Render COUNTRIES as pretty-printed JSON into COUNTRIES_JSON.
task build_json: :extract_population do
  pretty = JSON.pretty_generate(COUNTRIES)
  COUNTRIES_JSON = pretty
end
# Dump the generated CSV to standard output.
task print_csv: :build_csv do
  $stdout.puts COUNTRIES_CSV
end
# Write the generated CSV to countries.csv in the working directory.
task csv: :build_csv do
  output_name = 'countries.csv'
  write_file(output_name, COUNTRIES_CSV)
end
# Write the generated JSON to countries.json in the working directory.
task json: :build_json do
  output_name = 'countries.json'
  write_file(output_name, COUNTRIES_JSON)
end
# Dump the generated JSON to standard output.
task print_json: :build_json do
  $stdout.puts COUNTRIES_JSON
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment