Skip to content

Instantly share code, notes, and snippets.

@deanm0000
Created June 24, 2024 20:58
Show Gist options
  • Save deanm0000/a8b72958b02f2981537ff7a3feac8f98 to your computer and use it in GitHub Desktop.
Save deanm0000/a8b72958b02f2981537ff7a3feac8f98 to your computer and use it in GitHub Desktop.
import httpx
import asyncio
from bs4 import BeautifulSoup
import os
import geopandas as gpd
import pandas as pd
from pathlib import Path
from geoarrow.rust.core import (
GeoTable,
write_parquet,
read_parquet,
)
import time
async def dl_file(url, sem, client):
    """Download ``url`` and save the response body under ``./tracts/``.

    The file is named after the last path segment of ``url``. The
    ``./tracts`` directory must already exist.

    Args:
        url: Address of the file to fetch.
        sem: Async context manager (e.g. ``asyncio.Semaphore``) that limits
            how many downloads run at the same time.
        client: Object with an awaitable ``get(url)`` method returning a
            response that exposes ``raise_for_status()`` and ``content``
            (e.g. ``httpx.AsyncClient``).

    Raises:
        Whatever ``raise_for_status()`` raises for an unsuccessful response;
        in that case nothing is written to disk.
    """
    async with sem:
        resp = await client.get(url)
        # Fail loudly on an HTTP error rather than saving the error page
        # under a ".zip" name that later fails to parse as a shapefile.
        resp.raise_for_status()
        file = url.split("/")[-1]
        with open(f"./tracts/{file}", "wb") as ff:
            ff.write(resp.content)
async def get_tract_data():
    """Download the 2023 TIGER/Line tract shapefiles and save them as parquet.

    Steps:
      1. Fetch the Census Bureau shapefile index page and collect the
         ``value`` of every ``<option>`` element (presumably state FIPS
         codes -- verify against the page markup).
      2. Download ``tl_2023_<value>_tract.zip`` for each value into
         ``./tracts`` with at most five requests in flight.
      3. Read every downloaded archive with geopandas, concatenate the
         frames, and write the result to ``tracts2023gpd.parquet``
         (geopandas writer) and ``tracts2023geo.parquet`` (geoarrow writer).

    Raises:
        ValueError: if the index page yields no usable option values.
        Exception: an HTTP status error for the index page, or the first
            exception raised by any download.
    """
    headers = {
        "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
        "accept-language": "en-US,en;q=0.9",
        "cache-control": "no-cache",
        "pragma": "no-cache",
        "priority": "u=0, i",
        "sec-ch-ua": '"Not/A)Brand";v="8", "Chromium";v="126", "Brave";v="126"',
        "sec-ch-ua-mobile": "?0",
        "sec-ch-ua-platform": '"Windows"',
        "sec-fetch-dest": "document",
        "sec-fetch-mode": "navigate",
        "sec-fetch-site": "none",
        "sec-fetch-user": "?1",
        "sec-gpc": "1",
        "upgrade-insecure-requests": "1",
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36",
    }
    # The context manager closes the client's connections even when a
    # request or download fails.
    async with httpx.AsyncClient(headers=headers) as client:
        census_page = await client.get(
            "https://www.census.gov/cgi-bin/geo/shapefiles/index.php?year=2023&layergroup=Census+Tracts"
        )
        census_page.raise_for_status()
        bs_page = BeautifulSoup(census_page.content, "html.parser")
        # Skip options with a missing or empty value (e.g. a placeholder
        # entry): they would raise KeyError or build a URL that cannot exist.
        values = (option.get("value") for option in bs_page.find_all("option"))
        urls = [
            f"https://www2.census.gov/geo/tiger/TIGER2023/TRACT/tl_2023_{value}_tract.zip"
            for value in values
            if value
        ]
        if not urls:
            raise ValueError("no tract options found on the census index page")
        os.makedirs("./tracts", exist_ok=True)
        sem = asyncio.Semaphore(5)
        # gather() re-raises the first download failure; asyncio.wait() would
        # leave it unretrieved inside the task and carry on silently.
        await asyncio.gather(*(dl_file(url, sem, client) for url in urls))
    # Read from the directory dl_file writes to. Sorting makes the row order
    # of the concatenated frame reproducible across runs.
    all_states = [
        gpd.read_file(tract) for tract in sorted(Path("tracts").glob("*.zip"))
    ]
    df = pd.concat(all_states)
    df.to_parquet("tracts2023gpd.parquet")
    geodf = GeoTable.from_geopandas(df)
    write_parquet(geodf, "tracts2023geo.parquet")
def main():
    """Time ``simplify(0.5)`` on the tract geometries in geoarrow and geopandas.

    Reads ``tracts2023geo.parquet``, simplifies the geometry column with the
    geoarrow table, converts the table to geopandas, repeats the operation
    there, and prints the elapsed seconds for each. Only the ``simplify``
    calls are timed; reading the file and the geopandas conversion are not.
    The simplified geometries are discarded.
    """
    df = read_parquet("tracts2023geo.parquet")

    # perf_counter() is monotonic and high-resolution; time.time() follows
    # the wall clock, which can be adjusted in the middle of a measurement.
    strt = time.perf_counter()
    df.geometry.simplify(0.5)
    print(f"geoarrow simplify took {time.perf_counter()-strt} seconds")

    dfgpd = df.to_geopandas()
    strt = time.perf_counter()
    dfgpd.geometry.simplify(0.5)
    print(f"geopandas simplify took {time.perf_counter()-strt} seconds")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment