Skip to content

Instantly share code, notes, and snippets.

@deanm0000
deanm0000 / calpl.py
Last active February 1, 2024 17:14
Function to extract a sheet from a CalamineWorkbook into a Polars DataFrame
def pl_cal_sheet(
wb: CalamineWorkbook,
sheet: str,
header_rows: int | None = None,
header_merge_char: str = "_",
skip_rows: int = 0,
infer_schema_length: int = 1000,
infer_schema_minrow: int = 10,
column_dupe_name_seperator: str = "_",
) -> pl.DataFrame:
@deanm0000
deanm0000 / benchmark_filters.py
Last active March 20, 2024 15:16
benchmark filtering list in polars series
import polars as pl
import numpy as np
from itertools import product
import time
from datetime import datetime
import json
def gen_long_string(str_len=10, n_rows=10_000_000):
rng = np.random.default_rng()
@deanm0000
deanm0000 / add_prints_to_rust.py
Last active June 7, 2024 15:00
add eprint everywhere
from pathlib import Path
import re
rootpath = Path("./polars/crates")
for p in rootpath.rglob("*.rs"):
with p.open() as f:
filestr = f.read()
if filestr.find("fn") == -1:
continue
def parse_dtypes(df, exclude=[]):
str_cols = [x for x, y in df.schema.items() if y == pl.String and x not in exclude]
try_casts = df.select(
pl.struct(pl.all()).alias("original"),
pl.struct(
pl.coalesce(
pl.col(col).str.strptime(pl.Datetime, x, strict=False)
for x in ["%Y-%m-%dT%H:%M:%S", "%Y-%m-%d %H:%M:%S"]
)
for col in str_cols
import httpx
import asyncio
from bs4 import BeautifulSoup
import os
import geopandas as gpd
import pandas as pd
from pathlib import Path
from geoarrow.rust.core import (
GeoTable,
write_parquet,