Skip to content

Instantly share code, notes, and snippets.

@cholmes
Last active January 16, 2024 23:12
Show Gist options
  • Save cholmes/d6dc942a206fe37bd1b26ce8a7b83a03 to your computer and use it in GitHub Desktop.
Save cholmes/d6dc942a206fe37bd1b26ce8a7b83a03 to your computer and use it in GitHub Desktop.
import duckdb
import pandas as pd
# Function to count records with 'area_in_meters' over 1000
def count_large_areas(parquet_url, min_area=1000):
    """Count rows of a Parquet file whose 'area_in_meters' exceeds a threshold.

    The filter and the count are evaluated inside DuckDB, so only the single
    aggregate value comes back to Python rather than every row being
    materialised in a pandas DataFrame first.

    Args:
        parquet_url: Location of the Parquet file (local path or remote URL),
            handed to DuckDB's ``read_parquet``.
        min_area: Exclusive lower bound for 'area_in_meters'. Defaults to
            1000, the threshold this function originally hard-coded.

    Returns:
        The number of rows with 'area_in_meters' strictly greater than
        ``min_area``. Rows whose area is NULL are not counted.
    """
    conn = duckdb.connect(database=':memory:')
    try:
        # Bind the URL and threshold as parameters instead of formatting them
        # into the SQL text, so quotes in the URL cannot alter the query.
        row = conn.execute(
            "SELECT COUNT(*) FROM read_parquet(?) WHERE area_in_meters > ?",
            [parquet_url, min_area],
        ).fetchone()
    finally:
        # Release the connection even when the read or the query fails.
        conn.close()
    return row[0]
# Remote GeoParquet file to inspect (Google Open Buildings, country_iso=AG partition).
parquet_url = 'https://data.source.coop/cholmes/google-open-buildings/geoparquet-by-country/country_iso=AG/AG.parquet'

# Run the count against the remote file and report the result on stdout.
record_count = count_large_areas(parquet_url)
print(f"Number of records with 'area_in_meters' > 1000: {record_count}")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment