Code along on motherduck.com
ATTACH 'md:_share/stackoverflow/6c318917-6888-425a-bea1-5860c29947e5'
- Blog Post Part 1: https://motherduck.com/blog/exploring-stackoverflow-with-duckdb-on-motherduck-1/
- Blog Post Part 2: https://motherduck.com/blog/exploring-stackoverflow-with-duckdb-on-motherduck-2/
- Free DDBIA Ebook: https://motherduck.com/duckdb-book
SELECT count(*)
FROM read_csv_auto('Tags.csv.gz');
┌──────────────┐
│ count_star() │
│ int64 │
├──────────────┤
│ 64465 │
└──────────────┘
DESCRIBE(SELECT * from read_csv_auto('Tags.csv.gz') LIMIT 1);
┌───────────────┬─────────────┐
│ column_name │ column_type │
│ varchar │ varchar │
├───────────────┼─────────────┤
│ Id │ BIGINT │
│ TagName │ VARCHAR │
│ Count │ BIGINT │
│ ExcerptPostId │ BIGINT │
│ WikiPostId │ BIGINT │
└───────────────┴─────────────┘
SELECT TagName, Count
FROM read_csv('Tags.csv.gz',column_names=['Id','TagName','Count'],auto_detect=true)
ORDER BY Count DESC LIMIT 5;
┌────────────┬─────────┐
│ TagName │ Count │
│ varchar │ int64 │
├────────────┼─────────┤
│ javascript │ 2479947 │
│ python │ 2113196 │
│ java │ 1889767 │
│ c# │ 1583879 │
│ php │ 1456271 │
└────────────┴─────────┘
CREATE TABLE tags AS
FROM read_csv('Tags.csv.gz',column_names=['Id','TagName','Count'],auto_detect=true)
CREATE TABLE users AS
SELECT * from read_csv('Users.csv.gz',auto_detect=true,
column_names=['Id','Reputation','CreationDate','DisplayName',
'LastAccessDate','AboutMe','Views','UpVotes','DownVotes']);
-- 19942787 rows
-- we can leave off the select *
CREATE TABLE posts AS
FROM read_csv('Posts.csv.gz',auto_detect=true,
column_names=['Id','PostTypeId','AcceptedAnswerId','CreationDate',
'Score','ViewCount','Body','OwnerUserId','LastEditorUserId',
'LastEditorDisplayName','LastEditDate','LastActivityDate','Title',
'Tags','AnswerCount','CommentCount','FavoriteCount',
'CommunityOwnedDate','ContentLicense']);
-- 58329356 rows
.timer on
SELECT DisplayName, Reputation, LastAccessDate
FROM users ORDER BY Reputation DESC LIMIT 5;
┌─────────────────┬────────────┬─────────────────────────┐
│ DisplayName │ Reputation │ LastAccessDate │
│ varchar │ int64 │ timestamp │
├─────────────────┼────────────┼─────────────────────────┤
│ Jon Skeet │ 1389256 │ 2023-03-04 19:54:19.74 │
│ Gordon Linoff │ 1228338 │ 2023-03-04 15:16:02.617 │
│ VonC │ 1194435 │ 2023-03-05 01:48:58.937 │
│ BalusC │ 1069162 │ 2023-03-04 12:49:24.637 │
│ Martijn Pieters │ 1016741 │ 2023-03-03 19:35:13.76 │
└─────────────────┴────────────┴─────────────────────────┘
Run Time (s): real 0.126 user 2.969485 sys 1.696962
SELECT year(CreationDate) as year, count(*),
round(avg(ViewCount)), max(AnswerCount)
FROM posts
GROUP BY year ORDER BY year DESC LIMIT 10;
┌───────┬──────────────┬───────────────────────┬──────────────────┐
│ year │ count_star() │ round(avg(ViewCount)) │ max(AnswerCount) │
│ int64 │ int64 │ double │ int64 │
├───────┼──────────────┼───────────────────────┼──────────────────┤
│ 2023 │ 528575 │ 44.0 │ 15 │
│ 2022 │ 3353468 │ 265.0 │ 44 │
│ 2021 │ 3553972 │ 580.0 │ 65 │
│ 2020 │ 4313416 │ 847.0 │ 59 │
│ 2019 │ 4164538 │ 1190.0 │ 60 │
│ 2018 │ 4444220 │ 1648.0 │ 121 │
│ 2017 │ 5022978 │ 1994.0 │ 65 │
│ 2016 │ 5277269 │ 2202.0 │ 74 │
│ 2015 │ 5347794 │ 2349.0 │ 82 │
│ 2014 │ 5342607 │ 2841.0 │ 92 │
├───────┴──────────────┴───────────────────────┴──────────────────┤
│ 10 rows 4 columns │
└─────────────────────────────────────────────────────────────────┘
Run Time (s): real 5.977 user 7.498157 sys 5.480121 (1st run)
Run Time (s): real 0.039 user 4.609049 sys 0.078694
COPY (FROM users) TO 'users.parquet'
(FORMAT PARQUET, CODEC 'SNAPPY', ROW_GROUP_SIZE 100000);
-- Run Time (s): real 10.582 user 62.737265 sys 65.422181
COPY (FROM posts) TO 'posts.parquet'
(FORMAT PARQUET, CODEC 'SNAPPY', ROW_GROUP_SIZE 100000);
-- Run Time (s): real 57.314 user 409.517658 sys 334.606894
Upload database
CREATE DATABASE remote_database_name FROM CURRENT_DATABASE();
-- or more generally
CREATE DATABASE remote_database_name FROM '<local database name>';
Create database & tables
create database so;
create table users as
from 's3://us-prd-motherduck-open-datasets/stackoverflow/parquet/2023-05/users.parquet';
-- Run Time (s): real 10.401 user 0.006417 sys 0.003527
describe users;
┌────────────────┬─────────────┐
│ column_name │ column_type │
│ varchar │ varchar │
├────────────────┼─────────────┤
│ Id │ BIGINT │
│ Reputation │ BIGINT │
│ CreationDate │ TIMESTAMP │
│ DisplayName │ VARCHAR │
│ LastAccessDate │ TIMESTAMP │
│ AboutMe │ VARCHAR │
│ Views │ BIGINT │
│ UpVotes │ BIGINT │
│ DownVotes │ BIGINT │
│ Id │ BIGINT │
│ Reputation │ BIGINT │
│ CreationDate │ TIMESTAMP │
│ DisplayName │ VARCHAR │
│ LastAccessDate │ TIMESTAMP │
│ AboutMe │ VARCHAR │
│ Views │ BIGINT │
│ UpVotes │ BIGINT │
│ DownVotes │ BIGINT │
├────────────────┴─────────────┤
│ 18 rows │
└──────────────────────────────┘
Run Time (s): real 0.032 user 0.026184 sys 0.002383
-- do the same for the other tables
create table comments as
from 's3://us-prd-motherduck-open-datasets/stackoverflow/parquet/2023-05/comments.parquet';
create table posts as
from 's3://us-prd-motherduck-open-datasets/stackoverflow/parquet/2023-05/posts.parquet';
create table votes as
from 's3://us-prd-motherduck-open-datasets/stackoverflow/parquet/2023-05/votes.parquet';
create table badges as
from 's3://us-prd-motherduck-open-datasets/stackoverflow/parquet/2023-05/badges.parquet';
create table post_links as
from 's3://us-prd-motherduck-open-datasets/stackoverflow/parquet/2023-05/post_links.parquet';
create table tags as
from 's3://us-prd-motherduck-open-datasets/stackoverflow/parquet/2023-05/tags.parquet';
AI/LLM Query Commands
CALL prompt_schema();
-- summary = The database contains information related to posts, comments, votes, badges, tags, post links, and users for a platform.
-- Run Time (s): real 1.476 user 0.001069 sys 0.000778
-- summary = The database schema represents a collection of data about various aspects of a community platform, including users, posts, comments, tags, badges, votes, and post links.
.mode duckbox
pragma prompt_query('What are the most popular tags?');
┌────────────┬─────────┐
│ TagName │ Count │
│ varchar │ int64 │
├────────────┼─────────┤
│ javascript │ 2479947 │
│ python │ 2113196 │
│ java │ 1889767 │
│ c# │ 1583879 │
│ php │ 1456271 │
│ android │ 1400026 │
│ html │ 1167742 │
│ jquery │ 1033113 │
│ c++ │ 789699 │
│ css │ 787138 │
├────────────┴─────────┤
│ 10 rows 2 columns │
└──────────────────────┘
-- Run Time (s): real 3.763 user 0.124567 sys 0.001716
.mode line
call prompt_sql('What are the most popular tags?');
-- query = SELECT TagName, Count FROM tags ORDER BY Count DESC LIMIT 5;
-- Run Time (s): real 2.813 user 2.808042 sys 0.005866
pragma prompt_query("What question has the highest score and what are it's other attributes?");
Id = 11227809
PostTypeId = 1
AcceptedAnswerId = 11227902
CreationDate = 2012-06-27 13:51:36.16
Score = 26903
ViewCount = 1796363
Body =
OwnerUserId = 87234
LastEditorUserId = 87234
LastEditorDisplayName =
LastEditDate = 2022-10-12 18:56:47.68
LastActivityDate = 2023-01-10 04:40:07.12
Title = Why is processing a sorted array faster than processing an unsorted array?
Tags = <java><c++><performance><cpu-architecture><branch-prediction>
AnswerCount = 26
CommentCount = 9
FavoriteCount = 0
CommunityOwnedDate =
ContentLicense = CC BY-SA 4.0
call prompt_sql("What question has the highest score and what are it's other attributes?");
query = SELECT *
FROM posts
WHERE PostTypeId = 1
ORDER BY Score DESC
LIMIT 1;
-- Run Time (s): real 3.683 user 0.001970 sys 0.000994
.mode duckbox
pragma prompt_query("Which 5 questions have the most comments, what is the post title and comment count");
┌───────────────────────────────────────────────────────────────────────────┬───────────────┐
│ Title │ comment_count │
│ varchar │ int64 │
├───────────────────────────────────────────────────────────────────────────┼───────────────┤
│ UIImageView Frame Doesnt Reflect Constraints │ 108 │
│ Is it possible to use adb commands to click on a view by finding its ID? │ 102 │
│ How to create a new web character symbol recognizable by html/javascript? │ 100 │
│ Why isnt my CSS3 animation smooth in Google Chrome (but very smooth on ot │ 89 │
│ Heap Gives Page Fault │ 89 │
└───────────────────────────────────────────────────────────────────────────┴───────────────┘
Run Time (s): real 19.695 user 2.406446 sys 0.018353
.mode line
call prompt_sql("Which 5 questions have the most comments, what is the post title and comment count");
query = SELECT p.Title, COUNT(c.Id) AS comment_count
FROM posts p
JOIN comments c ON p.Id = c.PostId AND p.PostTypeId = 1
GROUP BY p.Title
ORDER BY comment_count DESC
LIMIT 5;
-- Run Time (s): real 4.795 user 0.002301 sys 0.001346
call prompt_sql("System: No joins! User: Which 5 questions have the most comments, what is the post title and comment count");
-- query =
SELECT Title, CommentCount
FROM posts
WHERE PostTypeId = 1
ORDER BY CommentCount DESC
LIMIT 5;
-- Run Time (s): real 3.587 user 0.001733 sys 0.000865
call prompt_fixup("select postTypeId, count(*) from posts");
-- query =
SELECT postTypeId, COUNT(*) FROM posts GROUP BY postTypeId
-- Run Time (s): real 12.006 user 0.004266 sys 0.002980
call prompt_fixup("select count(*) from posts join users on posts.userId = users.userId");
-- query =
SELECT COUNT(*) FROM posts JOIN users ON posts.OwnerUserId = users.Id
-- Run Time (s): real 2.378 user 0.001770 sys 0.001067
Create & Attach Shares
-- CREATE SHARE <share name> [FROM <database name>];
CREATE SHARE so_2023_05 FROM so;
-- share_url = md:_share/so/373594a2-06f7-4c33-814e-cf59028482ca
-- Run Time (s): real 63.335 user 0.014849 sys 0.013110
-- ATTACH '<share URL>' [AS <database name>];
ATTACH 'md:_share/so/373594a2-06f7-4c33-814e-cf59028482ca' AS so;
-- show the contents of the share
DESCRIBE SHARE "so_2023_05";
LIST SHARES;