Created
September 27, 2012 06:27
-
-
Save melborne/3792505 to your computer and use it in GitHub Desktop.
Gviz sample: U.S. States
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# encoding: UTF-8 | |
require "nokogiri" | |
require "open-uri" | |
module Gviz
  # Downloads an HTML page and extracts the rows of its "wikitable" tables.
  class Scraper
    class << self
      # Fetches the document at +path+ and returns its wikitable rows.
      #
      # @param path [String] location handed to {get}
      # @return [Array] the <tr> nodes found by {parse}
      def build(path)
        parse(get(path))
      end

      # Fetches +url+ and parses the response with Nokogiri::HTML.
      #
      # Uses URI.open because open-uri no longer patches Kernel#open on
      # Ruby 3.0+. The block form closes the IO once parsing is done.
      # On an HTTP error the message is written to STDERR and the process
      # exits with a non-zero status so callers can detect the failure.
      #
      # @param url [String] address of the page to download
      # @return [Object] the document returned by Nokogiri::HTML
      def get(url)
        URI.open(url) { |io| Nokogiri::HTML(io) }
      rescue OpenURI::HTTPError => e
        STDERR.puts "HTTP Access Error:#{e}"
        exit 1
      end

      # Collects every table row inside elements matching "table.wikitable".
      #
      # @param html [#css] a parsed document that answers CSS queries
      # @return [Array] the matched rows, in document order
      def parse(html)
        html.css("table.wikitable tr").to_a
      end
    end
  end
end
# When executed directly, scrape the Wikipedia list of U.S. states and print
# the text of the header cells followed by the text of every body row's cells.
if $PROGRAM_NAME == __FILE__
  rows = Gviz::Scraper.build("http://en.wikipedia.org/wiki/List_of_U.S._states")
  header = rows.first
  body = rows.drop(1)

  header_texts = header.css('th').map { |cell| cell.text }
  p header_texts

  body_texts = body.map do |tr|
    tr.css('td').map { |cell| cell.text }
  end
  p body_texts
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# http://en.wikipedia.org/wiki/List_of_U.S._states | |
# Column titles of the scraped table, in the same order as the fields of each
# BODY row. Frozen so the shared constant cannot be mutated by accident.
HEADER = [
  "Name",
  "IPA",
  "USPS",
  "Flag",
  "Statehood",
  "Area (sq mi)",
  "Population (2011[update])",
  "Capital",
  "Most populous city",
  "Preceding entity",
  "GDP\n($millions)\n"
].freeze
BODY = [["Alabama", "/ˌæləˈbæmə/", "AL", "", "01819-12-14December 14, 1819", "&1000000000013576500000052,419 sq mi (135,765 km2)", "4,802,740", "Montgomery", "Birmingham", "Alabama Territory", "174,400"], ["Alaska", "/əˈlæskə/", "AK", "", "01959-01-03January 3, 1959", "&10000000001717854000000663,267 sq mi (1,717,854 km2)", "722,718", "Juneau", "Anchorage", "Alaska Territory", "45,600"], ["Arizona", "/ˌærɪˈzoʊnə/", "AZ", "", "01912-02-14February 14, 1912", "&10000000000295254000000113,998 sq mi (295,254 km2)", "6,482,505", "Phoenix", "Phoenix", "Arizona Territory", "261,300"], ["Arkansas", "/ˈɑrkənsɔː/", "AR", "", "01836-06-15June 15, 1836", "&1000000000013700200000052,897 sq mi (137,002 km2)", "2,937,979", "Little Rock", "Little Rock", "Arkansas Territory", "105,800"], ["California", "/ˌkælɪˈfɔrnjə/", "CA", "", "01850-09-09September 9, 1850", "&10000000000423970000000163,700 sq mi (423,970 km2)", "37,691,912", "Sacramento", "Los Angeles", "Directly admitted from Mexican Cession", "1,936,400"], ["Colorado", "/ˌkɒləˈrædoʊ/", "CO", "", "01876-08-01August 1, 1876", "&10000000000269837000000104,185 sq mi (269,837 km2)", "5,116,796", "Denver", "Denver", "Colorado Territory", "259,700"], | |
--- (middle rows omitted) --- | |
["Washington", "/ˈwɒʃɪŋtən/", "WA", "", "01889-11-11November 11, 1889", "&1000000000018482700000071,362 sq mi (184,827 km2)", "6,830,038", "Olympia", "Seattle", "Washington Territory", "351,100"], ["West Virginia", "/ˌwɛst vərˈdʒɪnjə/", "WV", "", "01863-06-20June 20, 1863", "&1000000000006275500000024,230 sq mi (62,755 km2)", "1,855,364", "Charleston", "Charleston", "Divided off from Virginia without the consent of that state", "66,600"], ["Wisconsin", "/wɪsˈkɒnsɪn/", "WI", "", "01848-05-29May 29, 1848", "&1000000000016963900000065,498 sq mi (169,639 km2)", "5,711,767", "Madison", "Milwaukee", "Wisconsin Territory, formed from the Northwest Territory", "251,400"], ["Wyoming", "/waɪˈoʊmɪŋ/", "WY", "", "01890-07-10July 10, 1890", "&1000000000025334800000097,818 sq mi (253,348 km2)", "568,158", "Cheyenne", "Cheyenne", "Wyoming Territory", "38,200"]] |
We can make this file beautiful and searchable if this error is corrected: It looks like row 2 should actually have 3 columns, instead of 6. in line 1.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
usps,name,links | |
AL,Alabama,MS,FL,GA,TN | |
AK,Alaska,WA | |
AZ,Arizona,CA,NV,UT,NM,CO | |
AR,Arkansas,MO,TN,MS,LA,TX,OK | |
CA,California,OR,NV,AZ | |
CO,Colorado,WY,NE,KS,OK,NM,AZ,UT | |
CT,Connecticut,NY,MA,RI | |
DE,Delaware,MD,PA,NJ | |
FL,Florida,AL,GA | |
GA,Georgia,FL,AL,TN,NC,SC | |
HI,Hawaii | |
ID,Idaho,MT,WY,UT,NV,OR,WA | |
IL,Illinois,WI,IN,KY,MO,IA | |
IN,Indiana,MI,OH,KY,IL | |
IA,Iowa,MN,WI,IL,MO,NE,SD | |
KS,Kansas,NE,MO,OK,CO | |
KY,Kentucky,MO,IL,IN,OH,WV,VA,TN | |
LA,Louisiana,TX,AR,MS | |
ME,Maine,NH | |
MD,Maryland,VA,WV,PA,DE | |
MA,Massachusetts,NY,VT,NH,RI,CT | |
MI,Michigan,IN,OH,WI | |
MN,Minnesota,WI,IA,SD,ND | |
MS,Mississippi,LA,AR,TN,AL | |
MO,Missouri,IA,IL,KY,TN,AR,OK,KS,NE | |
MT,Montana,ND,SD,WY,ID | |
NE,Nebraska,SD,IA,MO,KS,CO,WY | |
NV,Nevada,CA,OR,ID,UT,AZ | |
NH,New Hampshire,ME,MA,VT | |
NJ,New Jersey,DE,PA,NY | |
NM,New Mexico,AZ,UT,CO,OK,TX | |
NY,New York,PA,VT,MA,CT,NJ | |
NC,North Carolina,VA,TN,GA,SC | |
ND,North Dakota,MN,SD,MT | |
OH,Ohio,MI,IN,KY,WV,PA | |
OK,Oklahoma,KS,MO,AR,TX,NM,CO | |
OR,Oregon,WA,ID,NV,CA | |
PA,Pennsylvania,NY,NJ,DE,MD,WV,OH | |
RI,Rhode Island,MA,CT | |
SC,South Carolina,NC,GA | |
SD,South Dakota,ND,MN,IA,NE,WY,MT | |
TN,Tennessee,KY,VA,NC,GA,AR,MS,AL,MO | |
TX,Texas,NM,OK,AR,LA | |
UT,Utah,ID,WY,CO,NM,AZ,NV | |
VT,Vermont,NY,MA,NH | |
VA,Virginia,WV,KY,TN,NC,MD | |
WA,Washington,OR,ID,AK | |
WV,West Virginia,OH,PA,MD,VA,KY | |
WI,Wisconsin,MI,IL,IA,MN | |
WY,Wyoming,MT,SD,NE,CO,UT,ID |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# encoding: UTF-8 | |
require "csv" | |
require_relative "usdata" | |
# Load the adjacency list, dropping the CSV header row. Each remaining row is
# "usps,name,neighbour,neighbour,..." and becomes [usps, name, [neighbours]]
# with the codes converted to symbols.
_, *links = CSV.read('uslinks.csv')
links = links.map do |usps, name, *link|
  [usps.intern, name, link.map(&:intern)]
end.sort

# Index the neighbour lists by USPS code so every state is matched by key
# rather than by its position in a sorted array.
neighbors_by_usps = links.each_with_object({}) do |(usps, _name, neighbors), lookup|
  lookup[usps] = neighbors
end

# Reduce each scraped BODY row to
# [usps, name, capital, statehood year, area, population, gdp].
usdata = BODY.map do |name, _ipa, usps, _flag, sthood, area, pop, cap, _popcity, _pre, gdp|
  name = name.sub(/\[\d+\]$/, '')   # strip a trailing footnote marker such as "[12]"
  sthood = sthood[/\d{4}$/]         # keep only the four-digit year at the end
  area = area[/(?<=\()[\d,]+/]      # take the figure inside the parentheses
  area = '858,927' if usps == 'AK'  # hard-coded override for Alaska
  figures = [sthood, area, pop, gdp].map { |e| e.delete(',').to_i }
  [usps.intern, name, cap] + figures
end.sort

# Append each state's neighbour list as the final element of its row.
# fetch raises KeyError for a state missing from the CSV instead of silently
# pairing it with another state's neighbours.
usdata = usdata.map { |row| row << neighbors_by_usps.fetch(row.first) }
require "gviz" | |
# Statehood eras, one "first-last" year span per line. Each line is turned
# into an inclusive Range of integers; the list order defines the era index.
era_spans = <<EOS
1776-1790
1791-1799
1800-1819
1820-1839
1840-1859
1860-1879
1880-1899
1900-1950
1950-1959
EOS
era = era_spans.each_line.map do |span|
  first_year, last_year = span.split('-').map(&:to_i)
  Range.new(first_year, last_year)
end
# Returns the Range running from the smallest to the largest value found in
# one column of +data+.
#
# @param data [Array<Array>] rows to inspect
# @param idx [Integer] 1-based position of the column within each row
# @return [Range] lowest..highest of that column
def minmax(data, idx)
  lowest, highest = data.map { |row| row[idx - 1] }.minmax
  Range.new(lowest, highest)
end
# Adds linear rescaling to integers.
#
# Integer is reopened instead of Fixnum: Fixnum was merged into Integer in
# Ruby 2.4 and later removed, while on older Rubies Fixnum inherits from
# Integer, so the method is available either way.
class Integer
  # Linearly maps this integer from the +current+ range onto the +target+
  # range and rounds the result to the nearest integer.
  #
  # @param current [Range] range the receiver is measured against
  # @param target [Range] range to project onto (defaults to 1..10)
  # @return [Integer] the rescaled, rounded value
  def norm(current, target = 1..10)
    span = current.end - current.begin
    # A zero-width source range would divide by zero and make round raise
    # FloatDomainError; map everything onto the start of the target instead.
    return target.begin.round if span.zero?

    unit = (self - current.begin) / span.to_f
    (unit * (target.end - target.begin) + target.begin).round
  end
end
# Value ranges of the area (5th) and population (6th) columns, used to scale
# node size and polygon side count.
area_range = minmax(usdata, 5)
pop_range = minmax(usdata, 6)

gv = Gviz.new(:USA)
gv.graph do
  global(layout: 'neato', overlap: false, label: 'U.S. states', fontsize: 92, size: 32)
  nodes(colorscheme: 'blues9', style: 'filled', shape: 'polygon', regular: true)
  edges(arrowhead: 'none')

  usdata.each do |id, name, _cap, year, area_value, population, _gdp, neighbors|
    # Earlier eras get higher colour numbers: 9 minus the era's position.
    shade = 9 - era.index { |span| span.include?(year) }
    scale = area_value.norm(area_range, 1..6)
    side_count = population.norm(pop_range, 4..12)

    route(id => neighbors)
    node(id, label: name, fillcolor: shade, width: scale, fontsize: 14 * scale, sides: side_count)
    # Switch to white text on the highest colour numbers.
    node(id, fontcolor: 'white') if shade > 7
  end
end
gv.save(:usa, :png)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment