mneedham/01_enums.py

## 01_enums.py
# Dataset: https://www.kaggle.com/datasets/wilmerarltstrmberg/recipe-dataset-over-2m

import duckdb

db1 = duckdb.connect('db1.duck.db')
db2 = duckdb.connect('db2.duck.db')

db1.sql("""
CREATE OR REPLACE TABLE recipes AS
FROM read_csv_auto('recipes_data.csv', header=True)
SELECT title, source, cast(NER AS varchar[]) AS NER, site, link;
""")

db2.sql("""
CREATE TYPE Source AS ENUM (
    'Gathered',
    'Recipes1M'
);
""")

db2.sql("""
CREATE TYPE Site AS ENUM (
    FROM read_csv_auto('recipes_data.csv', header=True)
    SELECT site
    WHERE site IS NOT NULL
);
""")

db2.sql("""
create or replace table recipes AS
FROM read_csv_auto('recipes_data.csv', header=True)
SELECT title,
       CAST(source AS Source) AS source,
       cast(NER AS varchar[]) AS NER,
       CAST(site AS Site) AS site,
       link;
""")

## 02_cardinalities.py
db2.sql("FROM recipes SELECT source, count(*) GROUP BY ALL")
# ┌───────────┬──────────────┐
# │  source   │ count_star() │
# │  source   │    int64     │
# ├───────────┼──────────────┤
# │ Gathered  │      1643098 │
# │ Recipes1M │       588044 │
# └───────────┴──────────────┘

db2.sql("FROM recipes SELECT site, count(*) GROUP BY ALL")
# ┌─────────────────────────┬──────────────┐
# │          site           │ count_star() │
# │          site           │    int64     │
# ├─────────────────────────┼──────────────┤
# │ www.cookbooks.com       │       896341 │
# │ www.food.com            │       499616 │
# │ cooking.nytimes.com     │        16367 │
# │ cookpad.com             │        61020 │
# │ www.epicurious.com      │       129444 │
# │ cookeatshare.com        │        59307 │
# │ recipeland.com          │        24418 │
# │ tastykitchen.com        │        78768 │
# │ allrecipes.com          │         3204 │
# │ www.foodnetwork.com     │        49443 │
# │         ·               │           ·  │
# │         ·               │           ·  │
# │         ·               │           ·  │
# │ www.landolakes.com      │         2492 │
# │ www.vegetariantimes.com │         4578 │
# │ www.delish.com          │         3880 │
# │ www.myrecipes.com       │        64895 │
# │ recipes-plus.com        │        20524 │
# │ food52.com              │        48501 │
# │ www.seriouseats.com     │        12632 │
# │ www.tasteofhome.com     │        51594 │
# │ www.yummly.com          │        51963 │
# │ www.allrecipes.com      │        61398 │
# ├─────────────────────────┴──────────────┤
# │ 28 rows (20 shown)           2 columns │
# └────────────────────────────────────────┘

## 03_file_size.py
import os
import glob

for db in glob.glob("*.duck.db"):
    print(db, os.stat(db).st_size)

# db2.duck.db 132395008
# db1.duck.db 131870720

## 04_enums.md

      
    Raw
  

              04_enums.md
            
          
    Dataset:
https://www.kaggle.com/datasets/wilmerarltstrmberg/recipe-dataset-over-2m
Tab 1:
./duckdb db1.duck.db
CREATE OR REPLACE TABLE recipes AS 
FROM read_csv_auto('recipes_data.csv', header=True)
SELECT title, source, cast(NER AS varchar[]) AS NER, site, link;
Tab 2:
./duckdb db2.duck.db
CREATE TYPE Source AS ENUM (
    'Gathered', 
    'Recipes1M'
);

CREATE TYPE Site AS ENUM (
    FROM read_csv_auto('recipes_data.csv', header=True)
    SELECT site
    WHERE site IS NOT NULL
);

create or replace table recipes AS 
FROM read_csv_auto('recipes_data.csv', header=True)
SELECT title, 
       CAST(source AS Source) AS source,
       cast(NER AS varchar[]) AS NER,
       CAST(site AS Site) AS site,
       link;
Tab 3:
du -k  -d 1 db*.db

131084	db1.duck.db
131084	db2.duck.db
	# Dataset: https://www.kaggle.com/datasets/wilmerarltstrmberg/recipe-dataset-over-2m

	import duckdb

	db1 = duckdb.connect('db1.duck.db')
	db2 = duckdb.connect('db2.duck.db')

	db1.sql("""
	CREATE OR REPLACE TABLE recipes AS
	FROM read_csv_auto('recipes_data.csv', header=True)
	SELECT title, source, cast(NER AS varchar[]) AS NER, site, link;
	""")

	db2.sql("""
	CREATE TYPE Source AS ENUM (
	'Gathered',
	'Recipes1M'
	);
	""")

	db2.sql("""
	CREATE TYPE Site AS ENUM (
	FROM read_csv_auto('recipes_data.csv', header=True)
	SELECT site
	WHERE site IS NOT NULL
	);
	""")

	db2.sql("""
	create or replace table recipes AS
	FROM read_csv_auto('recipes_data.csv', header=True)
	SELECT title,
	CAST(source AS Source) AS source,
	cast(NER AS varchar[]) AS NER,
	CAST(site AS Site) AS site,
	link;
	""")
	db2.sql("FROM recipes SELECT source, count(*) GROUP BY ALL")
	# ┌───────────┬──────────────┐
	# │ source │ count_star() │
	# │ source │ int64 │
	# ├───────────┼──────────────┤
	# │ Gathered │ 1643098 │
	# │ Recipes1M │ 588044 │
	# └───────────┴──────────────┘

	db2.sql("FROM recipes SELECT site, count(*) GROUP BY ALL")
	# ┌─────────────────────────┬──────────────┐
	# │ site │ count_star() │
	# │ site │ int64 │
	# ├─────────────────────────┼──────────────┤
	# │ www.cookbooks.com │ 896341 │
	# │ www.food.com │ 499616 │
	# │ cooking.nytimes.com │ 16367 │
	# │ cookpad.com │ 61020 │
	# │ www.epicurious.com │ 129444 │
	# │ cookeatshare.com │ 59307 │
	# │ recipeland.com │ 24418 │
	# │ tastykitchen.com │ 78768 │
	# │ allrecipes.com │ 3204 │
	# │ www.foodnetwork.com │ 49443 │
	# │ · │ · │
	# │ · │ · │
	# │ · │ · │
	# │ www.landolakes.com │ 2492 │
	# │ www.vegetariantimes.com │ 4578 │
	# │ www.delish.com │ 3880 │
	# │ www.myrecipes.com │ 64895 │
	# │ recipes-plus.com │ 20524 │
	# │ food52.com │ 48501 │
	# │ www.seriouseats.com │ 12632 │
	# │ www.tasteofhome.com │ 51594 │
	# │ www.yummly.com │ 51963 │
	# │ www.allrecipes.com │ 61398 │
	# ├─────────────────────────┴──────────────┤
	# │ 28 rows (20 shown) 2 columns │
	# └────────────────────────────────────────┘
	import os
	import glob

	for db in glob.glob("*.duck.db"):
	print(db, os.stat(db).st_size)

	# db2.duck.db 132395008
	# db1.duck.db 131870720