Skip to content

Instantly share code, notes, and snippets.

@ingenieroariel
Created September 27, 2023 21:03
Show Gist options
  • Save ingenieroariel/2ef747074efcb1b62d7589e125603f32 to your computer and use it in GitHub Desktop.
Save ingenieroariel/2ef747074efcb1b62d7589e125603f32 to your computer and use it in GitHub Desktop.
# Development-shell definition (the `packages` / `scripts.<name>.exec` /
# `enterShell` layout looks like a devenv.nix module -- confirm).
# Provides git, the Google Cloud SDK and Python 3.11 with pyarrow, duckdb and
# h3, plus a few helper commands.
{ pkgs, ... }:
{
  # Tools available inside the shell.
  packages = with pkgs; [
    git
    google-cloud-sdk
    (python311.withPackages(ps: with ps; [
      pyarrow
      duckdb
      h3
    ]))
  ];

  # google <dest>: sync gs://open-buildings-data/v3/points_s2_level_4_gzip
  # into <dest>. The argument is quoted so paths with spaces are not split.
  # NOTE(review): `-avhP` looks like classic rsync flags; verify them against
  # the options `gsutil rsync` actually accepts.
  scripts.google.exec = ''gsutil -m rsync -avhP gs://open-buildings-data/v3/points_s2_level_4_gzip "$1"'';

  # arrow <src_dir> <out_dir>: run google_to_arrow.py with warnings silenced
  # and report the elapsed time. Arguments are quoted for the same reason.
  scripts.arrow.exec = ''time python -W ignore google_to_arrow.py "$1" "$2"'';

  # pstac: placeholder command that only echoes a string.
  scripts.pstac.exec = ''echo "psych"'';

  # Commands run when the shell is entered: print the git version.
  enterShell = ''
    git --version
  '';
}
import os
import sys
import glob
import subprocess
import multiprocessing
import pyarrow as pa
import duckdb as db
import h3.unstable.vect as vect
def process_file(filename):
    """Convert one input file into an Arrow IPC file of H3 cell ids.

    Reads the ``latitude`` and ``longitude`` columns of *filename* through
    DuckDB, computes the resolution-15 H3 cell of every point and that cell's
    resolution-7 parent, and writes both as ``uint64`` columns named
    ``h3_15`` and ``h3_7``.

    The output goes to ``<out_dir>/<basename of filename>.arrow`` where
    ``out_dir`` is taken from ``sys.argv[2]``; the directory is created when
    it does not exist.

    Args:
        filename: Path of the file to read. It is embedded in the FROM clause
            of a DuckDB query.
    """
    print(f"Processing {filename}")

    # The path is spliced into a SQL string literal, so double any embedded
    # single quote; otherwise a path containing "'" breaks the query.
    sql_path = str(filename).replace("'", "''")
    b = db.sql(f"SELECT latitude, longitude FROM '{sql_path}'").arrow()

    lats = b["latitude"].combine_chunks().to_numpy()
    lons = b["longitude"].combine_chunks().to_numpy()

    # Vectorised H3 indexing: finest cells first, then their coarser parents.
    h3_15 = vect.geo_to_h3(lats, lons, 15)
    h3_7 = vect.h3_to_parent(h3_15, 7)

    table = pa.table({
        'h3_15': pa.array(h3_15, type=pa.uint64()),
        'h3_7': pa.array(h3_7, type=pa.uint64()),
    })

    out_dir = sys.argv[2]
    os.makedirs(out_dir, exist_ok=True)
    target_file = os.path.join(out_dir, os.path.basename(filename)) + ".arrow"

    with pa.OSFile(target_file, "wb") as sink:
        with pa.RecordBatchFileWriter(sink, table.schema) as writer:
            writer.write_table(table)
    print(f"Wrote {target_file}")
if __name__ == '__main__':
    # Expected invocation: <script> <google_dir> <out_dir>.
    # process_file reads sys.argv[2] inside every worker, so fail once here
    # instead of once per worker when an argument is missing.
    if len(sys.argv) < 3:
        sys.exit(f"usage: {sys.argv[0]} <google_dir> <out_dir>")

    google_dir = sys.argv[1]
    print(f"Entering {google_dir}")

    # Leave two cores free, but never request fewer than one worker:
    # multiprocessing.Pool raises ValueError when processes < 1, which the
    # plain `cpu_count() - 2` hits on machines with two or fewer CPUs.
    worker_count = max(1, multiprocessing.cpu_count() - 2)
    with multiprocessing.Pool(worker_count) as processing_pool:
        processing_pool.map(process_file, glob.glob(f"{google_dir}/*.csv.gz"))
import duckdb as db
import pandas as pd
import pyarrow as pa
from glob import glob
# `glob` is bound by `from glob import glob`, i.e. it is the function itself,
# so it must be called directly; `glob.glob(...)` raises AttributeError.
file_paths = glob("/Users/x/data/points_arrow_h3_15_and_7/*")


def tm(pa_dtype):
    """Map an Arrow data type to the matching pandas Arrow-backed dtype."""
    return pd.ArrowDtype(pa_dtype)


# Open and read each file, keeping only the h3_15 column of every table.
tables = []
for path in file_paths:
    with pa.ipc.open_file(path) as f:
        tables.append(f.read_all().to_pandas(types_mapper=tm)[["h3_15"]])

# Stack the per-file frames into one DataFrame.
# Note: pd.concat raises ValueError when no file matched the pattern.
df = pd.concat(tables)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment