Skip to content

Instantly share code, notes, and snippets.

import httpx
import asyncio
from bs4 import BeautifulSoup
import os
import geopandas as gpd
import pandas as pd
from pathlib import Path
from geoarrow.rust.core import (
GeoTable,
write_parquet,
def parse_dtypes(df, exclude=[]):
str_cols = [x for x, y in df.schema.items() if y == pl.String and x not in exclude]
try_casts = df.select(
pl.struct(pl.all()).alias("original"),
pl.struct(
pl.coalesce(
pl.col(col).str.strptime(pl.Datetime, x, strict=False)
for x in ["%Y-%m-%dT%H:%M:%S", "%Y-%m-%d %H:%M:%S"]
)
for col in str_cols
@deanm0000
deanm0000 / add_prints_to_rust.py
Last active June 7, 2024 15:00
Add eprint calls throughout the Rust (*.rs) sources under ./polars/crates
from pathlib import Path
import re
rootpath = Path("./polars/crates")
for p in rootpath.rglob("*.rs"):
with p.open() as f:
filestr = f.read()
if filestr.find("fn") == -1:
continue
@deanm0000
deanm0000 / benchmark_filters.py
Last active March 20, 2024 15:16
benchmark filtering list in polars series
import polars as pl
import numpy as np
from itertools import product
import time
from datetime import datetime
import json
def gen_long_string(str_len=10, n_rows=10_000_000):
rng = np.random.default_rng()
@deanm0000
deanm0000 / calpl.py
Last active February 1, 2024 17:14
Function to extract a sheet from a CalamineWorkbook into a polars DataFrame
def pl_cal_sheet(
wb: CalamineWorkbook,
sheet: str,
header_rows: int | None = None,
header_merge_char: str = "_",
skip_rows: int = 0,
infer_schema_length: int = 1000,
infer_schema_minrow: int = 10,
column_dupe_name_seperator: str = "_",
) -> pl.DataFrame: