Skip to content

Instantly share code, notes, and snippets.

@mneedham
Last active July 10, 2023 08:24
Show Gist options
  • Save mneedham/a9fbb65155efd10dda9b8e13b4785b38 to your computer and use it in GitHub Desktop.
Save mneedham/a9fbb65155efd10dda9b8e13b4785b38 to your computer and use it in GitHub Desktop.
enums in duckdb
# Dataset: https://www.kaggle.com/datasets/wilmerarltstrmberg/recipe-dataset-over-2m
import duckdb
db1 = duckdb.connect('db1.duck.db')
db2 = duckdb.connect('db2.duck.db')
db1.sql("""
CREATE OR REPLACE TABLE recipes AS
FROM read_csv_auto('recipes_data.csv', header=True)
SELECT title, source, cast(NER AS varchar[]) AS NER, site, link;
""")
db2.sql("""
CREATE TYPE Source AS ENUM (
'Gathered',
'Recipes1M'
);
""")
db2.sql("""
CREATE TYPE Site AS ENUM (
FROM read_csv_auto('recipes_data.csv', header=True)
SELECT site
WHERE site IS NOT NULL
);
""")
db2.sql("""
create or replace table recipes AS
FROM read_csv_auto('recipes_data.csv', header=True)
SELECT title,
CAST(source AS Source) AS source,
cast(NER AS varchar[]) AS NER,
CAST(site AS Site) AS site,
link;
""")
db2.sql("FROM recipes SELECT source, count(*) GROUP BY ALL")
# ┌───────────┬──────────────┐
# │ source │ count_star() │
# │ source │ int64 │
# ├───────────┼──────────────┤
# │ Gathered │ 1643098 │
# │ Recipes1M │ 588044 │
# └───────────┴──────────────┘
db2.sql("FROM recipes SELECT site, count(*) GROUP BY ALL")
# ┌─────────────────────────┬──────────────┐
# │ site │ count_star() │
# │ site │ int64 │
# ├─────────────────────────┼──────────────┤
# │ www.cookbooks.com │ 896341 │
# │ www.food.com │ 499616 │
# │ cooking.nytimes.com │ 16367 │
# │ cookpad.com │ 61020 │
# │ www.epicurious.com │ 129444 │
# │ cookeatshare.com │ 59307 │
# │ recipeland.com │ 24418 │
# │ tastykitchen.com │ 78768 │
# │ allrecipes.com │ 3204 │
# │ www.foodnetwork.com │ 49443 │
# │ · │ · │
# │ · │ · │
# │ · │ · │
# │ www.landolakes.com │ 2492 │
# │ www.vegetariantimes.com │ 4578 │
# │ www.delish.com │ 3880 │
# │ www.myrecipes.com │ 64895 │
# │ recipes-plus.com │ 20524 │
# │ food52.com │ 48501 │
# │ www.seriouseats.com │ 12632 │
# │ www.tasteofhome.com │ 51594 │
# │ www.yummly.com │ 51963 │
# │ www.allrecipes.com │ 61398 │
# ├─────────────────────────┴──────────────┤
# │ 28 rows (20 shown) 2 columns │
# └────────────────────────────────────────┘
import os
import glob
for db in glob.glob("*.duck.db"):
print(db, os.stat(db).st_size)
# db2.duck.db 132395008
# db1.duck.db 131870720

Dataset: https://www.kaggle.com/datasets/wilmerarltstrmberg/recipe-dataset-over-2m

Tab 1:

./duckdb db1.duck.db
CREATE OR REPLACE TABLE recipes AS 
FROM read_csv_auto('recipes_data.csv', header=True)
SELECT title, source, cast(NER AS varchar[]) AS NER, site, link;

Tab 2:

./duckdb db2.duck.db
CREATE TYPE Source AS ENUM (
    'Gathered', 
    'Recipes1M'
);

CREATE TYPE Site AS ENUM (
    FROM read_csv_auto('recipes_data.csv', header=True)
    SELECT site
    WHERE site IS NOT NULL
);

create or replace table recipes AS 
FROM read_csv_auto('recipes_data.csv', header=True)
SELECT title, 
       CAST(source AS Source) AS source,
       cast(NER AS varchar[]) AS NER,
       CAST(site AS Site) AS site,
       link;

Tab 3:

du -k  -d 1 db*.db

131084	db1.duck.db
131084	db2.duck.db
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment