Skip to content

Instantly share code, notes, and snippets.

@dummied
Created July 1, 2009 12:36
Show Gist options
  • Save dummied/138759 to your computer and use it in GitHub Desktop.
Save dummied/138759 to your computer and use it in GitHub Desktop.
require 'digest/md5'
# Finds street addresses and a leading dateline in free text and geocodes them.
#
# Besides the standard library this class references names defined outside this
# file: Towns, Oniguruma::ORegexp, Rails.cache, GeoKit::Geocoders::MultiGeocoder
# and the core extensions +blank?+ and +extract_options!+ (presumably
# ActiveSupport -- confirm against the host application).
class GeoUtil
  # Regex fragments for street-type suffixes ("Street", "Ave.", ...).
  COMMON_PLACE_SUFFIXES = [
    '[sS]treet',
    '[sS]t\.',
    '[aA]ve\.?',
    '[aA]venue',
    '[dD]r\.?',
    '[dD]rive',
    '[rR]d\.?',
    '[rR]oad',
    '[bB]lvd\.?',
    '[bB]oulevard',
    '[cC]ircle',
    '[pP]kwy\.?',
    '[pP]arkway',
    '[lL]ane',
    '[wW]ay'
  ]

  # Regex fragments for compass directions, abbreviated or spelled out.
  COMMON_DIRECTIONS = [
    'N\.?',
    'E\.?',
    'S\.?',
    'W\.?',
    'NW\.?',
    'NE\.?',
    'SW\.?',
    'SE\.?',
    # "North", "Northwest", "North-east", ... The second half must be an
    # alternation of whole words; a character class such as [wWest] would only
    # match a single letter.
    '[nN]orth(\-?([wW]est|[eE]ast))?',
    '[sS]outh(\-?([wW]est|[eE]ast))?',
    '[eE]ast',
    '[wW]est'
  ]

  # Words that, when they follow a number, mean the number is a quantity and
  # not a house number ("5 minutes", "3 times").
  # NOTE(review): "seconds" has no optional "s", unlike the other entries;
  # possibly so that "2 Second Street" still matches -- confirm before changing.
  NUMERIC_QUANTITIES = [
    '(?i:seconds)',
    '(?i:minutes?)',
    '(?i:hours?)',
    '(?i:years?)',
    '(?i:times?)',
    '(?i:ways?)'
  ]

  # Negative lookahead rejecting any of NUMERIC_QUANTITIES at the current position.
  IGNORE_NEXT_NUMERIC_QUANTITY = '(?!(' + NUMERIC_QUANTITIES.join('|') + '))'
  # Lookahead: a match must be followed by whitespace, end of line or a non-word char.
  END_OF_SUBJECT = '(?=\s|$|[^\w])'
  WORD = '[a-zA-Z0-9_\-\"#\.]+'

  # Characters that separate a dateline from the story text (hyphen, em dash).
  DATELINE_TERMINATORS = [ '\-',
                           '\—'
  ]
  DATELINE_TERMINATORS_CLASS = DATELINE_TERMINATORS.join("")
  DTERM = DATELINE_TERMINATORS_CLASS
  PROPER_NOUN = "[A-Z][^0-9#{DTERM}\\s\\(\\)]+"
  # A dateline city contains no lowercase letters, digits, commas or parentheses.
  DATELINE_CITY = "[A-Z][^0-9a-z\\,#{DTERM}\\(\\)]+"
  DATELINE_REGION = "(#{PROPER_NOUN}\\s)*#{PROPER_NOUN}"
  DATELINE_PUBLISHED_BY = '\([^\)]+\)'
  BY_STMT = '(?:By.+?)'
  # Capture 1 is the city, capture 2 the optional ", Region" part (comma included).
  DATELINE = /\A#{BY_STMT}?(#{DATELINE_CITY})(\,\s?(#{DATELINE_REGION})\s*)?(\s*#{DATELINE_PUBLISHED_BY}\s*)?[#{DTERM}]+\s*[A-Z"']/

  # Region assumed for a dateline city that appears without an explicit region.
  COMMON_REGIONS = {
    "NEW YORK" => "NY",
    "WASHINGTON" => "DC",
    "LOS ANGELES" => "CA",
    "HOLLYWOOD" => "CA",
    "BAGHDAD" => "IRAQ",
    "JERUSALEM" => "ISRAEL",
    "INDIANAPOLIS" => "IN",
    "CHICAGO" => "IL",
    "MOSCOW" => "RUSSIA",
    "TEHRAN" => "IRAN",
    "PARIS" => "FRANCE",
    "LONDON" => "UK",
    "BEIJING" => "CHINA",
    "AMSTERDAM" => "NETHERLANDS",
    "ROME" => "ITALY",
    "LITTLE ROCK" => "AR",
    "DALLAS" => "TX",
    "SANTA FE" => "NM",
    "OSLO" => "NORWAY",
    "DUBLIN" => "IRELAND",
    "TOKYO" => "JAPAN",
    "GLASGOW" => "SCOTLAND",
    "HOUSTON" => "TX",
    "PHILADELPHIA" => "PA",
    "ATLANTA" => "GA",
    "DETROIT" => "MI",
    "PITTSBURGH" => "PA",
    "SEATTLE" => "WA",
    "SAN FRANCISCO" => "CA",
    "SAN ANTONIO" => "TX",
    "OKLAHOMA CITY" => "OK",
    "NEW ORLEANS" => "LA",
    "MINNEAPOLIS" => "MN",
    "CINCINNATI" => "OH",
    "MEXICO CITY" => "MEXICO",
    "SALT LAKE CITY" => "UT",
    "HONOLULU" => "HAWAII",
    "PHOENIX" => "AZ",
    "CLEVELAND" => "OH",
    "UNITED NATIONS" => "NY",
    "HAVANA" => "CUBA"
  }

  # Country names; a dateline consisting only of one of these (compared
  # case-insensitively) is geocoded as the country itself.
  COUNTRIES = ["Afghanistan", "Aland Islands", "Albania", "Algeria", "American Samoa", "Andorra", "Angola",
               "Anguilla", "Antarctica", "Antigua And Barbuda", "Antigua", "Barbuda", "Argentina", "Armenia", "Aruba", "Australia", "Austria",
               "Azerbaijan", "Bahamas", "Bahrain", "Bangladesh", "Barbados", "Belarus", "Belgium", "Belize", "Benin",
               "Bermuda", "Bhutan", "Bolivia", "Bosnia and Herzegowina", "Bosnia", "Herzegowina", "Botswana", "Bouvet Island", "Brazil",
               "British Indian Ocean Territory", "Brunei Darussalam", "Bulgaria", "Burkina Faso", "Burundi", "Cambodia",
               "Cameroon", "Canada", "Cape Verde", "Cayman Islands", "Central African Republic", "Chad", "Chile", "China",
               "Christmas Island", "Cocos Islands", "Keeling Islands", "Colombia", "Comoros", "Congo",
               "the Democratic Republic of the Congo", "Cook Islands", "Costa Rica", "Cote d'Ivoire", "Croatia", "Cuba",
               "Cyprus", "Czech Republic", "Denmark", "Djibouti", "Dominica", "Dominican Republic", "Ecuador", "Egypt",
               "El Salvador", "Equatorial Guinea", "Eritrea", "Estonia", "Ethiopia", "Falkland Islands", "Malvinas",
               "Faroe Islands", "Fiji", "Finland", "France", "French Guiana", "French Polynesia",
               "French Southern Territories", "Gabon", "Gambia", "Georgia", "Germany", "Ghana", "Gibraltar", "Greece",
               "Greenland", "Grenada", "Guadeloupe", "Guam", "Guatemala", "Guernsey", "Guinea",
               "Guinea-Bissau", "Guyana", "Haiti", "Heard and McDonald Islands", "Heard Islands", "McDonald Islands",
               "Holy See", "Vatican City State", "Vatican City",
               "Honduras", "Hong Kong", "Hungary", "Iceland", "India", "Indonesia", "Islamic Republic of Iran", "Iran", "Iraq",
               "Ireland", "Isle of Man", "Israel", "Italy", "Jamaica", "Japan", "Jersey", "Jordan", "Kazakhstan", "Kenya",
               "Kiribati", "Democratic People's Republic of Korea", "Republic of Korea", "Kuwait", "Kyrgyzstan",
               "Lao People's Democratic Republic", "Laos", "Latvia", "Lebanon", "Lesotho", "Liberia", "Libyan Arab Jamahiriya",
               "Liechtenstein", "Lithuania", "Luxembourg", "Macao", "The Former Yugoslav Republic Of Macedonia", "Macedonia",
               "Madagascar", "Malawi", "Malaysia", "Maldives", "Mali", "Malta", "Marshall Islands", "Martinique",
               "Mauritania", "Mauritius", "Mayotte", "Mexico", "Federated States of Micronesia", "Micronesia", "Republic of Moldova", "Moldova",
               "Monaco", "Mongolia", "Montenegro", "Montserrat", "Morocco", "Mozambique", "Myanmar", "Namibia", "Nauru",
               "Nepal", "Netherlands", "Netherlands Antilles", "New Caledonia", "New Zealand", "Nicaragua", "Niger",
               "Nigeria", "Niue", "Norfolk Island", "Northern Mariana Islands", "Norway", "North Korea", "Oman", "Pakistan", "Palau",
               "Occupied Palestinian Territory", "Palestine", "Panama", "Papua New Guinea", "Paraguay", "Peru", "Philippines",
               "Pitcairn", "Poland", "Portugal", "Puerto Rico", "Qatar", "Reunion", "Romania", "Russian Federation",
               "Rwanda", "Saint Barthelemy", "Saint Helena", "Saint Kitts and Nevis", "Saint Lucia",
               "Saint Pierre and Miquelon", "Saint Pierre", "Miquelon", "Saint Vincent and the Grenadines",
               "Saint Vincent", "Grenadines", "Samoa", "San Marino",
               "Sao Tome and Principe", "Sao Tome", "Principe", "Saudi Arabia", "Senegal", "Serbia", "Seychelles", "Sierra Leone", "Singapore",
               "Slovakia", "Slovenia", "Solomon Islands", "Somalia", "South Africa",
               "South Georgia and the South Sandwich Islands", "South Georgia", "South Sandwich Islands", "Spain",
               "South Korea", "Sri Lanka", "Sudan", "Suriname",
               "Svalbard and Jan Mayen", "Svalbard", "Jan Mayen", "Swaziland", "Sweden", "Switzerland", "Syrian Arab Republic", "Syria",
               "Taiwan, Province of China", "Taiwan", "Tajikistan", "United Republic of Tanzania", "Tanzania", "Thailand", "Timor-Leste",
               "Togo", "Tokelau", "Tonga", "Trinidad and Tobago", "Trinidad", "Tobago", "Tunisia", "Turkey", "Turkmenistan",
               "Turks and Caicos Islands", "Turks Islands", "Caicos Islands", "Tuvalu", "Uganda", "Ukraine", "United Arab Emirates", "United Kingdom",
               "United States", "United States Minor Outlying Islands", "Uruguay", "Uzbekistan", "Vanuatu", "Venezuela",
               "Viet Nam", "Vietnam", "British Virgin Islands", "Virgin Islands", "US Virgin Islands", "Wallis and Futuna",
               "Wallis", "Futuna", "Western Sahara", "Yemen", "Zambia", "Zimbabwe"]

  # Scans +body+ for a leading dateline and for street addresses, geocoding each.
  #
  # HTML-like tags are removed and the text is stripped first (on a copy; the
  # caller's string is not modified). If the text starts with a dateline, its
  # city/region become the default locality for street addresses found later;
  # otherwise "Indianapolis" / "IN" is used.
  #
  # @param body [String] text to scan
  # @return [Array<Array>] pairs of [matched text, geocode result], dateline
  #   first; only results whose +lat+ is non-nil are included
  def self.coordinates_from_text(body)
    body = body.dup
    body.gsub!(/<[^>]+>/, "")
    body.strip! # not interested in surrounding ws or html tags

    found = []
    dateline = DATELINE.match(body)
    if dateline
      default_city = dateline[1].strip
      explicit_region = dateline[2]
      city_key = default_city.downcase
      if explicit_region.blank? && COUNTRIES.any? { |country| country.downcase == city_key }
        # A bare country name is geocoded on its own and serves as the region.
        locality = default_region = default_city
        default_city = ""
      else
        # Capture 2 still carries its leading comma; drop it.
        region = explicit_region.strip.sub(/\A,\s*/, "") if explicit_region
        # Non-mutating strip so strings owned by COMMON_REGIONS are never altered.
        default_region = (region || COMMON_REGIONS[default_city] || "IN").strip
        locality = "#{default_city}, #{default_region}"
      end
      res = usable_geocode(locality)
      found << [locality, res] if res
    else
      default_city = "Indianapolis"
      default_region = "IN"
    end

    matches = Oniguruma::ORegexp.new(detect_regex_source).scan(body)
    (matches || []).each do |m|
      town = m[:town] || default_city
      parts = [m.to_s]
      parts << town unless town.blank?
      parts << default_region unless default_region.blank?
      res = usable_geocode(parts.join(", "))
      found << [m.to_s, res] if res
    end
    found
  end

  # Geocodes +address+, using Rails.cache to avoid repeated lookups.
  #
  # The cache is written only when the geocoder was actually called (cache
  # miss or forced refresh); a cache hit is returned without rewriting it.
  #
  # @param address [#to_s] address to geocode
  # @param args trailing options hash; +:force+ => true bypasses the cached value
  # @return whatever GeoKit::Geocoders::MultiGeocoder.geocode returned (now or
  #   when the value was cached)
  def self.coordinates_from_address(address, *args)
    options = args.extract_options!
    ckey = Digest::MD5.hexdigest("GU_#{address}".gsub(/\s/, '_'))
    val = begin
      Rails.cache.read(ckey)
    rescue StandardError
      nil # cache reads are best-effort; fall through to the geocoder
    end
    if options[:force] || val.nil?
      val = GeoKit::Geocoders::MultiGeocoder.geocode(address)
      Rails.cache.write(ckey, val)
    end
    val
  end

  # Geocodes +locality+ and returns the (first) result, or nil when there is
  # no result or its +lat+ is nil.
  def self.usable_geocode(locality)
    res = coordinates_from_address(locality)
    res = res.first if res.is_a? Array
    res if res && !res.lat.nil?
  end
  private_class_method :usable_geocode

  # Builds the source of the street-address regex; the optional trailing town
  # is exposed as the named group +town+. Rebuilt per call because it embeds
  # the current Towns.towns list.
  def self.detect_regex_source
    numbers = '[0-9]+'
    safe_numbers = '(' + numbers + ' ' + IGNORE_NEXT_NUMERIC_QUANTITY + ')'
    joined_common_place_suffixes = '(' + COMMON_PLACE_SUFFIXES.join("|") + ')'
    street_token = WORD + '\s'
    street_tokens = '(' + street_token + ')+?'
    qualified_street_names = '(' + safe_numbers + street_tokens + joined_common_place_suffixes + ')'
    directional_names = '(' + COMMON_DIRECTIONS.join('|') + ')'
    inexact_street_name = '(([A-Z0-9]' + WORD + '\s)*([A-Z0-9]' + WORD + '))'
    # NOTE(review): safe_numbers already ends with a space and is followed by
    # another literal space here, so a house number is only captured when two
    # spaces precede the direction -- confirm whether that is intended.
    directional_street_names = '(' + safe_numbers + '? ' + directional_names + ' ' + inexact_street_name + ' ' + joined_common_place_suffixes + '?)'
    either_qualified_or_directional_names = '(' + directional_street_names + '|' + qualified_street_names + ')'
    # Avoid starting inside a number such as "1,234" or "3.50".
    not_prefixed_by_number_separator = '(?<!\d(?:\.|,))'
    towns = Towns.towns.join('|')
    towns_named_group = '(\s*(,|in|at)\s*(?<town>((?i:' + towns + ')|([A-Z][a-z\-]+\s?)+)))?'
    '(' + not_prefixed_by_number_separator +
      either_qualified_or_directional_names +
      towns_named_group +
      END_OF_SUBJECT + ')'
  end
  private_class_method :detect_regex_source
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment