Skip to content

Instantly share code, notes, and snippets.

@iloveitaly
Created August 19, 2011 17:09
Show Gist options
  • Save iloveitaly/1157372 to your computer and use it in GitHub Desktop.
Save iloveitaly/1157372 to your computer and use it in GitHub Desktop.
Normalizing Addresses With Google Refine + Maps
# eliminate stop words and create base name from title
import re
import urllib2
stop_words = urllib2.urlopen("http://www.textfixer.com/resources/common-english-words.txt").read().split(",") + ["cds", "mp", "cd", "dvd"]
words = re.sub(r"[^a-zA-Z]", ' ', value).lower().split(" ")
existing = set()
return "_".join([i for i in words if len(i) > 1 and not i in stop_words and i != "" and not i in existing and not existing.add(i) and (len(existing) < 5 or len("_".join(existing)) < 35) ])

Note: this has only been tested on US addresses. ### Grab GMap Data ###

"http://maps.googleapis.com/maps/api/geocode/json?sensor=false&address=" + escape(
	trim(cells["address"].value) + " " +
	if(hasField(cells, "city"), trim(cells["city"].value) + " ", "") +
	if(hasField(cells, "state"), trim(cells["state"].value) + " ", "") +
	with(
		cells["zip"].value,
		zip,
		if(type(zip) == "number", zip.toString()[0, 5], zip)
	),
"url")

### Parse GMap Data ### Note: the tricky part here is postal codes: GMap strips out the PO box, so the expression below prepends it again from the original address.

with(
	cells["address"].value.trim(),
	street,
	with(
		value.parseJson()["results"],
		hits,
		if(
			street.replace(/^[POpo.]{2,4}/, "PO").startsWith("PO"),
			street.replace(/^[POpo.]{2,4}/, "P.O.") + ", " +
				if(length(hits) > 1,
					hits[1]["formatted_address"],
					if(length(hits[0]["address_components"]) == 4,
						hits[0]["formatted_address"].replace(/([A-Z]{2}),/, "$1 " + cells["zip"].value + ","),
						hits[0]["formatted_address"]
					)
				),
			hits[0]["formatted_address"]
		)
	)
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment