Skip to content

Instantly share code, notes, and snippets.

@andelink
Last active November 5, 2021 17:57
Show Gist options
  • Save andelink/df12a0b81f6b9d213e309a9f6df85029 to your computer and use it in GitHub Desktop.
Save andelink/df12a0b81f6b9d213e309a9f6df85029 to your computer and use it in GitHub Desktop.
Haversine Distance Spark SQL
"""
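Haversine (great-circle) distance between every pair of cities, computed with
Spark SQL column expressions.

Reference gist: https://gist.github.com/pavlov99/bd265be244f8a84e291e96c5656ceb5c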
"""
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
# Sample rows: (airport code, city name, latitude, longitude).
# Latitude/longitude are in degrees; haversine() converts them with F.radians.
CITIES = [
    ('HKG', 'Hong Kong', 22.308919, 113.914603),
    ('SYD', 'Sydney', -33.946111, 151.177222),
    ('YYZ', 'Toronto', 43.677223, -79.630556),
]

# Sphere radius used by haversine(); the result is expressed in this unit.
# 6371.0 is the commonly quoted mean Earth radius in kilometres.
AVG_EARTH_RADIUS = 6371.0
def haversine(lat1, lng1, lat2, lng2):
    """Build a Column with the haversine (great-circle) distance between two points.

    Args:
        lat1: Latitude of the first point, in degrees.
        lng1: Longitude of the first point, in degrees.
        lat2: Latitude of the second point, in degrees.
        lng2: Longitude of the second point, in degrees.

        Each argument is handed to ``F.radians``; the script in this file
        passes DataFrame columns.

    Returns:
        A Column expression equal to ``2 * AVG_EARTH_RADIUS * asin(sqrt(d))``,
        where ``d`` is the haversine term. The distance is in the same unit
        as ``AVG_EARTH_RADIUS``.
    """
    lat1 = F.radians(lat1)
    lng1 = F.radians(lng1)
    lat2 = F.radians(lat2)
    lng2 = F.radians(lng2)

    lat = lat2 - lat1
    lng = lng2 - lng1

    d = F.sin(lat * 0.5) ** 2 + F.cos(lat1) * F.cos(lat2) * F.sin(lng * 0.5) ** 2

    # Floating-point rounding can push ``d`` marginally above 1 for
    # (near-)antipodal points, and asin() of a value > 1 is NaN. Clamp such
    # values to 1. Spark orders NaN above every number, so NaN is excluded
    # from the ``> 1`` test explicitly; null and NaN inputs therefore still
    # propagate unchanged, exactly as before.
    d = F.when((d > 1.0) & ~F.isnan(d), F.lit(1.0)).otherwise(d)

    return 2 * AVG_EARTH_RADIUS * F.asin(F.sqrt(d))
# Start (or reuse) a Spark session running locally on all available cores.
spark = SparkSession.builder.master('local[*]').getOrCreate()

# Two separately created copies of the city table, aliased so the self-join
# below can tell the left side from the right side.
column_names = ['airport', 'city', 'lat', 'lng']
a = spark.createDataFrame(CITIES, column_names).alias('a')
b = spark.createDataFrame(CITIES, column_names).alias('b')
a.show()

# Pack every column of each side into a single struct column.
a_ = F.struct('a.*')
b_ = F.struct('b.*')

# Pair each city with every *other* city; this yields both (x, y) and (y, x).
city_pairs = a.join(b, a.airport != b.airport)

# Put the smaller struct first and the larger one second, so both orderings of
# a pair collapse into the same row, then drop the resulting duplicates.
pair_distances = city_pairs.select(
    F.least(a_, b_).alias('a'),
    F.greatest(a_, b_).alias('b'),
    haversine(a.lat, a.lng, b.lat, b.lng).alias('distance'),
).distinct()

pair_distances.show(truncate=False)
>>>
+-------+---------+----------+----------+
|airport| city| lat| lng|
+-------+---------+----------+----------+
| HKG|Hong Kong| 22.308919|113.914603|
| SYD| Sydney|-33.946111|151.177222|
| YYZ| Toronto| 43.677223|-79.630556|
+-------+---------+----------+----------+
+---------------------------------------+-------------------------------------+------------------+
|a |b |distance |
+---------------------------------------+-------------------------------------+------------------+
|{HKG, Hong Kong, 22.308919, 113.914603}|{SYD, Sydney, -33.946111, 151.177222}|7393.8837884771565|
|{HKG, Hong Kong, 22.308919, 113.914603}|{YYZ, Toronto, 43.677223, -79.630556}|12548.533187172497|
|{SYD, Sydney, -33.946111, 151.177222} |{YYZ, Toronto, 43.677223, -79.630556}|15554.728375861841|
+---------------------------------------+-------------------------------------+------------------+
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment