Dataset: https://www.kaggle.com/datasets/wilmerarltstrmberg/recipe-dataset-over-2m
Tab 1 (baseline: source and site loaded without ENUM types):
./duckdb db1.duck.db
-- Baseline load: source and site keep whatever types the CSV reader infers;
-- only NER is explicitly cast, to VARCHAR[] (a list of strings).
CREATE OR REPLACE TABLE recipes AS
SELECT
    title,
    source,
    CAST(NER AS VARCHAR[]) AS NER,
    site,
    link
FROM read_csv_auto('recipes_data.csv', header = true);
Tab 2 (same table, but source and site stored as ENUM types):
./duckdb db2.duck.db
-- ENUM with a fixed, hand-written set of values.
-- NOTE(review): presumably these are the only two values that occur in the
-- CSV's source column — confirm before casting the column to this type.
CREATE TYPE Source AS ENUM ('Gathered', 'Recipes1M');
-- ENUM whose values are taken from the non-NULL entries of the CSV's site column.
-- NOTE(review): no DISTINCT is applied here; this presumably relies on DuckDB
-- collapsing duplicate values when an ENUM is built from a query — confirm
-- for the DuckDB version in use.
CREATE TYPE Site AS ENUM (
    SELECT site
    FROM read_csv_auto('recipes_data.csv', header = true)
    WHERE site IS NOT NULL
);
-- ENUM-typed load: same columns as a plain load of the CSV, but source and
-- site are cast to the Source and Site ENUM types, and NER to VARCHAR[].
CREATE OR REPLACE TABLE recipes AS
SELECT
    title,
    CAST(source AS Source) AS source,
    CAST(NER AS VARCHAR[]) AS NER,
    CAST(site AS Site) AS site,
    link
FROM read_csv_auto('recipes_data.csv', header = true);
Tab 3 (compare the on-disk size of the two database files, in 1 KiB blocks):
du -k -d 1 db*.db
131084 db1.duck.db
131084 db2.duck.db