Skip to content

Instantly share code, notes, and snippets.

@joshmoore
Created February 28, 2024 15:02
Show Gist options
  • Save joshmoore/715b6cb74e74fce4feac7c610eef4d96 to your computer and use it in GitHub Desktop.
Save joshmoore/715b6cb74e74fce4feac7c610eef4d96 to your computer and use it in GitHub Desktop.
GBIF Occurrence Map Generation
# Step 1: truncate the GBIF occurrence coordinates.
#
# Reads the "decimallatitude" / "decimallongitude" columns of a local GBIF
# parquet snapshot, truncates both to `num_places` decimal places and writes
# them out as "lat" / "long" columns to the "export" parquet directory.

# SparkContext is imported from `pyspark`, its documented location, instead of
# relying on `pyspark.sql.functions` happening to expose the name.
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext
from pyspark.sql.functions import col, lit, pow
from pyspark.sql.types import LongType

# Primary argument: how many decimal places of the long & lat to keep.
num_places = 1

# Input data, see https://data-blog.gbif.org/post/aws-and-gbif/
# Download and unzip one of the snapshot files so this glob matches it.
gbif_snapshot_path = "occurrence.parquet/*"

# getOrCreate() returns the already-active context when there is one (e.g. in
# a notebook or when several steps run in the same interpreter) instead of
# raising "Cannot run multiple SparkContexts at once"; otherwise it starts a
# local context named "test", exactly as before.
sc = SparkContext.getOrCreate(
    SparkConf().setMaster("local").setAppName("test")
)
sqlContext = SQLContext(sc)

df = sqlContext.read.parquet(gbif_snapshot_path)
export_df = df.select("decimallatitude", "decimallongitude")

# Scale factor 10**num_places as an integer column. Multiplying by it, casting
# to a long (which discards the fractional part) and dividing again leaves
# `num_places` decimals on each coordinate.
m = pow(lit(10), num_places).cast(LongType())

trunc_df = (
    export_df.withColumn(
        "lat", (col("decimallatitude") * m).cast(LongType()) / m
    )
    .withColumn("long", (col("decimallongitude") * m).cast(LongType()) / m)
    .drop("decimallatitude")
    .drop("decimallongitude")
)

# Replace any previous export so the step can be re-run.
trunc_df.write.mode("overwrite").parquet("export")
# Step 2: filter the truncated coordinates and add a count column.
#
# Reads the "export" parquet directory (columns "lat" / "long"), discards
# unusable coordinates, counts the rows per distinct (lat, long) pair and
# writes the result (with a "count" column) to the "counts" directory.

# SparkContext is imported from `pyspark`, its documented location, instead of
# relying on `pyspark.sql.functions` happening to expose the name.
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext

# getOrCreate() returns the already-active context when there is one (e.g. the
# one left over from the truncation step in the same interpreter) instead of
# raising "Cannot run multiple SparkContexts at once"; otherwise it starts a
# local context named "test", exactly as before.
sc = SparkContext.getOrCreate(
    SparkConf().setMaster("local").setAppName("test")
)
sqlContext = SQLContext(sc)

gbif_snapshot_path = "export/*"
df = sqlContext.read.parquet(gbif_snapshot_path)

# Keep only rows that have both coordinates and are not the (0, 0) placeholder.
# The previous predicate only rejected rows where BOTH values were NULL; under
# SQL three-valued logic a row with a single NULL coordinate was kept when the
# other value was non-zero but dropped when it was 0.0. Requiring both values
# to be non-NULL makes the outcome consistent.
cleanup = (
    "lat is not NULL and long is not NULL "
    "and !(lat == 0.0 and long == 0.0)"
)

counts = (
    df.filter(cleanup)
    .groupBy("lat", "long")
    .count()
)

# Replace any previous result so the step can be re-run.
counts.write.mode("overwrite").parquet("counts")
import glob
import pandas as pd
import hvplot.pandas # noqa
import holoviews as hv
# Step 3: render the per-coordinate counts as a PNG map.
#
# The whole "counts" output is loaded into a single pandas DataFrame, so this
# assumes the data has been sufficiently reduced by the earlier steps.
part_files = glob.glob("counts/*.parquet")
df = pd.read_parquet(part_files)

# Rasterized scatter of longitude against latitude with histogram-equalized
# colour normalization. (Optional alternative colormap: cmap="gouldian".)
scatter_options = {
    "x": "long",
    "y": "lat",
    "rasterize": True,
    "cnorm": "eq_hist",
}
dmap = df.hvplot.scatter(**scatter_options)

hv.save(dmap, "map.png", fmt="png")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment