Skip to content

Instantly share code, notes, and snippets.

@dportabella
Last active March 24, 2017 19:14
Show Gist options
  • Save dportabella/d3ec64c3907daeaa00c086a967a2c67b to your computer and use it in GitHub Desktop.
Save dportabella/d3ec64c3907daeaa00c086a967a2c67b to your computer and use it in GitHub Desktop.
compute distance in km between two postal codes
// using build.sbt: libraryDependencies += "org.apache.sis.core" % "sis-referencing" % "0.7" (then: import org.apache.sis.distance.DistanceUtils)
// using Ammonite: import $ivy.`org.apache.sis.core:sis-referencing:0.7`, org.apache.sis.distance.DistanceUtils
/** A geographic position: latitude and longitude as decimal numbers. */
case class Coordinates(lat: Double, lon: Double)

/**
 * Loads a GeoNames postal-code dump into a lookup table.
 *
 * Each line of the file is tab-separated; column 0 (country code) and column 1
 * (postal code) are joined into the key "<country>-<postalCode>", and columns
 * 9 and 10 are parsed as latitude and longitude.
 *
 * @param file path of the tab-separated dump (e.g. the unzipped allCountries.txt)
 * @return map from "<country>-<postalCode>" to its coordinates
 * @throws java.lang.ArrayIndexOutOfBoundsException if a line has fewer than 11 columns
 * @throws java.lang.NumberFormatException if a latitude/longitude column is not a number
 */
def readCoordinates(file: String): Map[String, Coordinates] = {
  def parseLine(line: String): (String, Coordinates) = {
    val c = line.split("\t")
    (c(0) + "-" + c(1), Coordinates(c(9).toDouble, c(10).toDouble))
  }

  // There are some duplicated postal codes; `toMap` keeps the last occurrence of each key.
  // To list the duplicates:
  // scala.io.Source.fromFile(file).getLines().map(parseLine).toList.groupBy(_._1).filter(_._2.size > 1).foreach(println)

  // The GeoNames dump is UTF-8; do not rely on the platform default charset.
  val source = scala.io.Source.fromFile(file, "UTF-8")
  // `toMap` consumes the whole iterator, so the file can be closed right after.
  try source.getLines().map(parseLine).toMap
  finally source.close()
}
/**
 * Haversine distance between two positions, delegated to Apache SIS `DistanceUtils`.
 *
 * @param c1 first position
 * @param c2 second position
 * @return the distance reported by `DistanceUtils.getHaversineDistance` (kilometres, per this function's name)
 */
def distanceInKm(c1: Coordinates, c2: Coordinates): Double = {
  val (fromLat, fromLon) = (c1.lat, c1.lon)
  val (toLat, toLon) = (c2.lat, c2.lon)
  DistanceUtils.getHaversineDistance(fromLat, fromLon, toLat, toLon)
}
/**
 * Distance between two postal codes, each given as a "<country>-<postalCode>" key
 * of the `coordinates` table (e.g. "CH-1004").
 *
 * @param postalCode1 key of the first postal code
 * @param postalCode2 key of the second postal code
 * @return distance between the two looked-up positions
 * @throws java.util.NoSuchElementException if either key is missing from `coordinates`
 */
def distanceInKm(postalCode1: String, postalCode2: String): Double = {
  val from = coordinates(postalCode1)
  val to = coordinates(postalCode2)
  distanceInKm(from, to)
}
// Lookup table of all known postal codes.
// The input file is obtained by unzipping http://download.geonames.org/export/zip/allCountries.zip
val coordinates: Map[String, Coordinates] = {
  val dumpPath = "./allCountries.txt"
  readCoordinates(dumpPath)
}

// Quick sanity check: number of entries, then a small sample of them.
println(coordinates.size) // 612928
for (sample <- coordinates.take(3)) println(sample)
/*
(PT-2950-316,Coordinates(38.569,-8.9013))
(US-57006,Coordinates(44.3056,-96.7914))
(PL-85-455,Coordinates(53.15,18.0))
*/
// Spot checks on known city pairs; each distance is printed on its own line.
Seq(
  "CH-1200" -> "CH-1004", // Geneva - Lausanne: 50.37 Km
  "ES-08001" -> "CH-1004" // Barcelona - Lausanne: 673.56 Km
).foreach { case (from, to) =>
  println(distanceInKm(from, to))
}
// Batch mode: every non-blank line of the query file holds two tab-separated
// postal-code keys; the same two keys plus their distance are printed, tab-separated.
val queryFile = "./query.txt"
val querySource = scala.io.Source.fromFile(queryFile)
try {
  querySource
    .getLines()
    // Blank lines (e.g. a trailing newline at the end of the file) are skipped, not treated as errors.
    .filter(_.trim.nonEmpty)
    .foreach { line =>
      line.split("\t") match {
        case Array(postalCode1, postalCode2) =>
          println(postalCode1 + "\t" + postalCode2 + "\t" + distanceInKm(postalCode1, postalCode2))
        case _ =>
          // Fail loudly and name the offending line instead of an opaque MatchError.
          throw new IllegalArgumentException(
            s"$queryFile: expected two tab-separated postal codes, got: '$line'")
      }
    }
} finally {
  // Release the file handle even if a lookup or parse fails midway.
  querySource.close()
}
/*
FILE query.txt
CH-1200 CH-1004
ES-08001 CH-1004
...
FILE output:
CH-1200 CH-1004 50.37020530697646
ES-08001 CH-1004 673.566602548844
...
*/
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment