Skip to content

Instantly share code, notes, and snippets.

@samuelcolvin
Created May 18, 2023 10:25
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save samuelcolvin/088b0d8b7a2f85bd32a71edb865af0cf to your computer and use it in GitHub Desktop.
Save samuelcolvin/088b0d8b7a2f85bd32a71edb865af0cf to your computer and use it in GitHub Desktop.
import polars as pl
# Allow up to 30 rows to be shown when a table is printed.
pl.Config.set_tbl_rows(30)

ds = pl.read_parquet('page_views.parquet')
ds = ds.sort('ts')

# Drop a leading version segment (`/latest`, `/dev-v<digit>` or
# `/<digit>.<digits>`) so the same page under different versions is
# counted as a single path.
normalised_path = pl.col('path').str.replace(r'^/(latest|dev-v\d|\d\.\d+)', '')

# Each path's count as a percentage of the sum of all counts; this is applied
# before the result is truncated, so it is relative to every path.
share_of_total = (pl.col('count') / pl.sum('count') * 100).alias('percentage')

path_counts = ds.with_columns(normalised_path)
path_counts = path_counts.groupby('path').agg([pl.count().alias('count')])
path_counts = path_counts.with_columns(share_of_total)
# Keep only the 30 most frequent paths.
path_counts = path_counts.sort('count', descending=True).head(30)

print(path_counts)
from datetime import datetime
from pathlib import Path
import polars as pl
from pydantic import BaseModel, Field, TypeAdapter, AliasPath
from devtools import debug
class PageView(BaseModel):
    """One page-view record.

    Two fields are populated from nested values of the input object via
    validation aliases, so the model itself stays flat.
    """

    user_id: str
    # Taken from the nested `ts` -> `$date` value of the input.
    ts: datetime = Field(validation_alias=AliasPath('ts', '$date'))
    # Taken from the nested `url` -> `path` value; defaults to '?'.
    path: str = Field('?', validation_alias=AliasPath('url', 'path'))
    # Defaults to '?' when the input provides no value.
    referer: str = Field('?')
# Read the export as raw bytes; they are handed to `validate_json` unchanged.
data = Path('docs_pydantic_dev.json').read_bytes()
ta = TypeAdapter(list[PageView])

# NOTE(review): indentation was lost in this copy of the script; each timer is
# assumed to wrap only the single statement below it - confirm against the
# original.
with debug.timer('load data into Pydantic'):
    # takes 57s for just `json.loads`
    models = ta.validate_json(data)

print(f'loaded {len(models)} models')
debug(models[:3])

with debug.timer('create df'):
    # load the data directly into a polars DataFrame
    df = pl.DataFrame(models)

print(df)
df.write_parquet('page_views.parquet')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment