Skip to content

Instantly share code, notes, and snippets.

@fuwiak
Created December 4, 2022 00:54
Show Gist options
  • Save fuwiak/8441842fb839196173e5e30f739e48ae to your computer and use it in GitHub Desktop.
Save fuwiak/8441842fb839196173e5e30f739e48ae to your computer and use it in GitHub Desktop.
import pyspark
#import udf
from pyspark.sql.functions import udf
from pyspark.sql.types import BooleanType
from shapely.geometry import Point, Polygon
# Reuse an already-running SparkContext if there is one: calling
# SparkContext() a second time in the same process (e.g. re-running this
# script in a notebook) raises an error, whereas getOrCreate() is idempotent.
sc = pyspark.SparkContext.getOrCreate()
# The builder is the documented entry point for obtaining a session; it
# attaches to the context obtained above.
spark = pyspark.sql.SparkSession.builder.getOrCreate()
# Load the export; the first row holds column names and column types are
# inferred from the data.
df = spark.read.csv('export.csv', header=True, inferSchema=True)
# Area of interest, equivalent to the WKT:
# POLYGON((-85.6795 42.7345, -85.6785 42.7345, -85.6785 42.7338, -85.6795 42.7338, -85.6795 42.7345))
# Vertices are (x, y) pairs, matching the Point(lon, lat) order used when
# testing rows; the ring is closed explicitly (first vertex == last vertex).
coords = [
    (-85.6795, 42.7345),
    (-85.6785, 42.7345),
    (-85.6785, 42.7338),
    (-85.6795, 42.7338),
    (-85.6795, 42.7345),
]
poly = Polygon(coords)
def check_within_polygon(lat, lon):
    """Return whether the point (lon, lat) lies within the module-level ``poly``.

    Args:
        lat: Y coordinate of the point (passed as the second ``Point`` argument).
        lon: X coordinate of the point (passed as the first ``Point`` argument).

    Returns:
        True or False according to ``Point.within(poly)``, or None when
        either coordinate is missing.
    """
    # Spark hands SQL NULLs to Python UDFs as None; a point cannot be built
    # from a missing coordinate, so propagate the null instead of failing.
    if lat is None or lon is None:
        return None
    # Point takes (x, y), i.e. (lon, lat) -- the same order as the polygon
    # vertices. Coerce to a plain bool so the value is always acceptable for
    # a BooleanType UDF result, whatever boolean-like type shapely returns.
    return bool(Point(lon, lat).within(poly))
# Wrap the Python predicate as a Spark UDF producing a boolean column.
check_within_polygon_udf = udf(check_within_polygon, returnType=BooleanType())
# Add the per-row polygon membership flag to the dataframe.
task1 = df.withColumn(
    'within_polygon',
    check_within_polygon_udf(df['position_lat'], df['position_lon']),
)
# Display only the coordinates together with the computed flag.
shown_columns = ['position_lat', 'position_lon', 'within_polygon']
task1.select(*shown_columns).show()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment