Created
July 1, 2009 12:36
-
-
Save dummied/138759 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'digest/md5'

# Finds place references in article text -- a leading dateline and street
# addresses -- and resolves each one through a geocoder.
#
# Relies on names defined outside this file: Towns, Oniguruma::ORegexp,
# GeoKit::Geocoders::MultiGeocoder, Rails.cache, and ActiveSupport's
# blank? / extract_options! core extensions.
class GeoUtil
  # Regex fragments for street-type words that end a street name.
  COMMON_PLACE_SUFFIXES = [
    '[sS]treet',
    '[sS]t\.',
    '[aA]ve\.?',
    '[aA]venue',
    '[dD]r\.?',
    '[dD]rive',
    '[rR]d\.?',
    '[rR]oad',
    '[bB]lvd\.?',
    '[bB]oulevard',
    '[cC]ircle',
    '[pP]kwy\.?',
    '[pP]arkway',
    '[lL]ane',
    '[wW]ay'
  ].freeze

  # Regex fragments for compass directions that may prefix a street name.
  # The compound forms take an optional hyphen ("Northwest", "South-east").
  COMMON_DIRECTIONS = [
    'N\.?',
    'E\.?',
    'S\.?',
    'W\.?',
    'NW\.?',
    'NE\.?',
    'SW\.?',
    'SE\.?',
    # Spelled-out alternation; a character class such as [wWest] would only
    # match a single letter and never the whole word.
    '[nN]orth(\-?([wW]est|[eE]ast))?',
    '[sS]outh(\-?([wW]est|[eE]ast))?',
    '[eE]ast',
    '[wW]est'
  ].freeze

  # Words that, when they follow a number, mark it as a quantity rather than
  # a house number.
  # NOTE(review): 'seconds' has no optional "s" unlike its siblings; possibly
  # deliberate so that "Second" street names still parse -- confirm.
  NUMERIC_QUANTITIES = [
    '(?i:seconds)',
    '(?i:minutes?)',
    '(?i:hours?)',
    '(?i:years?)',
    '(?i:times?)',
    '(?i:ways?)'
  ].freeze

  # Negative lookahead rejecting any of NUMERIC_QUANTITIES.
  IGNORE_NEXT_NUMERIC_QUANTITY = '(?!(' + NUMERIC_QUANTITIES.join('|') + '))'
  # Lookahead requiring whitespace, end of line, or a non-word character.
  END_OF_SUBJECT = '(?=\s|$|[^\w])'
  # One token of a street name.
  WORD = '[a-zA-Z0-9_\-\"#\.]+'

  # Characters that separate a dateline from the article text.
  DATELINE_TERMINATORS = [
    '\-',
    '\—'
  ].freeze
  DATELINE_TERMINATORS_CLASS = DATELINE_TERMINATORS.join("")
  DTERM = DATELINE_TERMINATORS_CLASS

  PROPER_NOUN = "[A-Z][^0-9#{DTERM}\\s\\(\\)]+"
  DATELINE_CITY = "[A-Z][^0-9a-z\\,#{DTERM}\\(\\)]+"
  DATELINE_REGION = "(#{PROPER_NOUN}\\s)*#{PROPER_NOUN}"
  DATELINE_PUBLISHED_BY = '\([^\)]+\)'
  BY_STMT = '(?:By.+?)'

  # Matches a dateline at the very start of the text.
  # Capture 1 is the city, capture 2 the ", Region" part (comma included),
  # capture 3 the region alone.
  DATELINE = /\A#{BY_STMT}?(#{DATELINE_CITY})(\,\s?(#{DATELINE_REGION})\s*)?(\s*#{DATELINE_PUBLISHED_BY}\s*)?[#{DTERM}]+\s*[A-Z"']/

  # Region assumed for a dateline city that appears without one.
  COMMON_REGIONS = {
    "NEW YORK" => "NY",
    "WASHINGTON" => "DC",
    "LOS ANGELES" => "CA",
    "HOLLYWOOD" => "CA",
    "BAGHDAD" => "IRAQ",
    "JERUSALEM" => "ISRAEL",
    "INDIANAPOLIS" => "IN",
    "CHICAGO" => "IL",
    "MOSCOW" => "RUSSIA",
    "TEHRAN" => "IRAN",
    "PARIS" => "FRANCE",
    "LONDON" => "UK",
    "BEIJING" => "CHINA",
    "AMSTERDAM" => "NETHERLANDS",
    "ROME" => "ITALY",
    "LITTLE ROCK" => "AR",
    "DALLAS" => "TX",
    "SANTA FE" => "NM",
    "OSLO" => "NORWAY",
    "DUBLIN" => "IRELAND",
    "TOKYO" => "JAPAN",
    "GLASGOW" => "SCOTLAND",
    "HOUSTON" => "TX",
    "PHILADELPHIA" => "PA",
    "ATLANTA" => "GA",
    "DETROIT" => "MI",
    "PITTSBURGH" => "PA",
    "SEATTLE" => "WA",
    "SAN FRANCISCO" => "CA",
    "SAN ANTONIO" => "TX",
    "OKLAHOMA CITY" => "OK",
    "NEW ORLEANS" => "LA",
    "MINNEAPOLIS" => "MN",
    "CINCINNATI" => "OH",
    "MEXICO CITY" => "MEXICO",
    "SALT LAKE CITY" => "UT",
    "HONOLULU" => "HAWAII",
    "PHOENIX" => "AZ",
    "CLEVELAND" => "OH",
    "UNITED NATIONS" => "NY",
    "HAVANA" => "CUBA"
  }.freeze

  # Names compared case-insensitively against a dateline "city" to detect a
  # dateline that is really a bare country.
  COUNTRIES = ["Afghanistan", "Aland Islands", "Albania", "Algeria", "American Samoa", "Andorra", "Angola",
    "Anguilla", "Antarctica", "Antigua And Barbuda", "Antigua", "Barbuda", "Argentina", "Armenia", "Aruba", "Australia", "Austria",
    "Azerbaijan", "Bahamas", "Bahrain", "Bangladesh", "Barbados", "Belarus", "Belgium", "Belize", "Benin",
    "Bermuda", "Bhutan", "Bolivia", "Bosnia and Herzegowina", "Bosnia", "Herzegowina", "Botswana", "Bouvet Island", "Brazil",
    "British Indian Ocean Territory", "Brunei Darussalam", "Bulgaria", "Burkina Faso", "Burundi", "Cambodia",
    "Cameroon", "Canada", "Cape Verde", "Cayman Islands", "Central African Republic", "Chad", "Chile", "China",
    "Christmas Island", "Cocos Islands", "Keeling Islands", "Colombia", "Comoros", "Congo",
    "the Democratic Republic of the Congo", "Cook Islands", "Costa Rica", "Cote d'Ivoire", "Croatia", "Cuba",
    "Cyprus", "Czech Republic", "Denmark", "Djibouti", "Dominica", "Dominican Republic", "Ecuador", "Egypt",
    "El Salvador", "Equatorial Guinea", "Eritrea", "Estonia", "Ethiopia", "Falkland Islands", "Malvinas",
    "Faroe Islands", "Fiji", "Finland", "France", "French Guiana", "French Polynesia",
    "French Southern Territories", "Gabon", "Gambia", "Georgia", "Germany", "Ghana", "Gibraltar", "Greece",
    "Greenland", "Grenada", "Guadeloupe", "Guam", "Guatemala", "Guernsey", "Guinea",
    "Guinea-Bissau", "Guyana", "Haiti", "Heard and McDonald Islands", "Heard Islands", "McDonald Islands",
    "Holy See", "Vatican City State", "Vatican City",
    "Honduras", "Hong Kong", "Hungary", "Iceland", "India", "Indonesia", "Islamic Republic of Iran", "Iran", "Iraq",
    "Ireland", "Isle of Man", "Israel", "Italy", "Jamaica", "Japan", "Jersey", "Jordan", "Kazakhstan", "Kenya",
    "Kiribati", "Democratic People's Republic of Korea", "Republic of Korea", "Kuwait", "Kyrgyzstan",
    "Lao People's Democratic Republic", "Laos", "Latvia", "Lebanon", "Lesotho", "Liberia", "Libyan Arab Jamahiriya",
    "Liechtenstein", "Lithuania", "Luxembourg", "Macao", "The Former Yugoslav Republic Of Macedonia", "Macedonia",
    "Madagascar", "Malawi", "Malaysia", "Maldives", "Mali", "Malta", "Marshall Islands", "Martinique",
    "Mauritania", "Mauritius", "Mayotte", "Mexico", "Federated States of Micronesia", "Micronesia", "Republic of Moldova", "Moldova",
    "Monaco", "Mongolia", "Montenegro", "Montserrat", "Morocco", "Mozambique", "Myanmar", "Namibia", "Nauru",
    "Nepal", "Netherlands", "Netherlands Antilles", "New Caledonia", "New Zealand", "Nicaragua", "Niger",
    "Nigeria", "Niue", "Norfolk Island", "Northern Mariana Islands", "Norway", "North Korea", "Oman", "Pakistan", "Palau",
    "Occupied Palestinian Territory", "Palestine", "Panama", "Papua New Guinea", "Paraguay", "Peru", "Philippines",
    "Pitcairn", "Poland", "Portugal", "Puerto Rico", "Qatar", "Reunion", "Romania", "Russian Federation",
    "Rwanda", "Saint Barthelemy", "Saint Helena", "Saint Kitts and Nevis", "Saint Lucia",
    "Saint Pierre and Miquelon", "Saint Pierre", "Miquelon", "Saint Vincent and the Grenadines",
    "Saint Vincent", "Grenadines", "Samoa", "San Marino",
    "Sao Tome and Principe", "Sao Tome", "Principe", "Saudi Arabia", "Senegal", "Serbia", "Seychelles", "Sierra Leone", "Singapore",
    "Slovakia", "Slovenia", "Solomon Islands", "Somalia", "South Africa",
    "South Georgia and the South Sandwich Islands", "South Georgia", "South Sandwich Islands", "Spain",
    "South Korea", "Sri Lanka", "Sudan", "Suriname",
    "Svalbard and Jan Mayen", "Svalbard", "Jan Mayen", "Swaziland", "Sweden", "Switzerland", "Syrian Arab Republic", "Syria",
    "Taiwan, Province of China", "Taiwan", "Tajikistan", "United Republic of Tanzania", "Tanzania", "Thailand", "Timor-Leste",
    "Togo", "Tokelau", "Tonga", "Trinidad and Tobago", "Trinidad", "Tobago", "Tunisia", "Turkey", "Turkmenistan",
    "Turks and Caicos Islands", "Turks Islands", "Caicos Islands", "Tuvalu", "Uganda", "Ukraine", "United Arab Emirates", "United Kingdom",
    "United States", "United States Minor Outlying Islands", "Uruguay", "Uzbekistan", "Vanuatu", "Venezuela",
    "Viet Nam", "Vietnam", "British Virgin Islands", "Virgin Islands", "US Virgin Islands", "Wallis and Futuna",
    "Wallis", "Futuna", "Western Sahara", "Yemen", "Zambia", "Zimbabwe"].freeze

  # Scans +body+ for a dateline and for street addresses and geocodes each.
  #
  # body - String of article text. Tag-like <...> spans and surrounding
  #        whitespace are removed from a copy; the caller's string is left
  #        untouched.
  #
  # When the text has no dateline, street addresses are resolved against
  # "Indianapolis", "IN".
  #
  # Returns an Array of [matched_text, geocode_result] pairs: the dateline
  # locality first (when present), then one pair per street-address match.
  # Lookups whose result is nil or has a nil +lat+ are left out.
  def self.coordinates_from_text(body)
    body = body.gsub(/<[^>]+>/, "").strip
    found = []

    dateline = body.match(DATELINE)
    if dateline
      default_city, default_region, locality = locality_from_dateline(dateline)
      res = geocode_first(locality)
      found << [locality, res] if res
    else
      default_city = "Indianapolis"
      default_region = "IN"
    end

    matches = Oniguruma::ORegexp.new(street_address_pattern).scan(body)
    (matches || []).each do |m|
      street = m.to_s
      town = m[:town] || default_city
      # NOTE(review): +street+ is the whole match, which already contains the
      # town text when one was captured, so the town can appear twice in the
      # query sent to the geocoder -- confirm whether that is intended.
      parts = [street]
      parts << town unless town.blank?
      parts << default_region unless default_region.blank?
      res = geocode_first(parts.join(", "))
      found << [street, res] if res
    end

    found
  end

  # Geocodes +address+, consulting Rails.cache first.
  #
  # address - value to geocode; its to_s form (whitespace replaced by "_",
  #           then MD5-hashed) is the cache key.
  # args    - optional trailing Hash; :force => true skips the cached value
  #           and geocodes again.
  #
  # A failing cache read is treated as a miss. The cache is written only
  # after a fresh geocode, never on a cache hit.
  #
  # Returns whatever the cache held or the geocoder returned.
  def self.coordinates_from_address(address, *args)
    options = args.extract_options!
    ckey = Digest::MD5.hexdigest("GU_#{address}".gsub(/\s/, '_'))

    cached = begin
      Rails.cache.read(ckey)
    rescue StandardError
      nil # best effort: an unreadable cache must not block geocoding
    end
    return cached unless options[:force] || cached.nil?

    fresh = GeoKit::Geocoders::MultiGeocoder.geocode(address)
    Rails.cache.write(ckey, fresh)
    fresh
  end

  # Derives [default_city, default_region, locality] from a DATELINE match.
  def self.locality_from_dateline(dateline)
    city = dateline[1].strip
    region_part = dateline[2]

    # A bare country ("FRANCE - ...") is used as the locality and the region,
    # leaving no default city.
    if region_part.nil? && COUNTRIES.any? { |c| c.downcase == city.downcase }
      return ["", city, city]
    end

    explicit_region = region_part && region_part.strip.gsub(/^,\s*/, "")
    # Non-mutating strip: the value may be a string owned by COMMON_REGIONS.
    region = (explicit_region || COMMON_REGIONS[city] || "IN").strip
    [city, region, "#{city}, #{region}"]
  end

  # Geocodes +locality+ and returns the first result, or nil when there is
  # no result or it has no latitude.
  def self.geocode_first(locality)
    res = coordinates_from_address(locality)
    res = res.first if res.is_a?(Array)
    return nil unless res
    res.lat.nil? ? nil : res
  end

  # Builds the regex source that finds street addresses, optionally followed
  # by ", Town" / "in Town" / "at Town" (named group +town+).
  def self.street_address_pattern
    numbers = '[0-9]+'
    safe_numbers = '(' + numbers + ' ' + IGNORE_NEXT_NUMERIC_QUANTITY + ')'
    suffixes = '(' + COMMON_PLACE_SUFFIXES.join("|") + ')'
    street_tokens = '(' + WORD + '\s' + ')+?'
    qualified_street_names = '(' + safe_numbers + street_tokens + suffixes + ')'
    directional_names = '(' + COMMON_DIRECTIONS.join('|') + ')'
    inexact_street_name = '(([A-Z0-9]' + WORD + '\s)*([A-Z0-9]' + WORD + '))'
    # NOTE(review): safe_numbers already ends with a space, so a house number
    # before a direction needs two spaces to match here -- confirm intent.
    directional_street_names = '(' + safe_numbers + '? ' + directional_names + ' ' +
                               inexact_street_name + ' ' + suffixes + '?)'
    street_names = '(' + directional_street_names + '|' + qualified_street_names + ')'
    not_prefixed_by_number_separator = '(?<!\d(?:\.|,))'
    towns = Towns.towns.join('|')
    towns_named_group = '(\s*(,|in|at)\s*(?<town>((?i:' + towns + ')|([A-Z][a-z\-]+\s?)+)))?'

    '(' + not_prefixed_by_number_separator +
      street_names +
      towns_named_group +
      END_OF_SUBJECT + ')'
  end

  private_class_method :locality_from_dateline, :geocode_first, :street_address_pattern
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment