Skip to content

Instantly share code, notes, and snippets.

@jexp
Last active February 11, 2024 13:31
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jexp/cbd6d1d234280444550fcc20dd46b58f to your computer and use it in GitHub Desktop.
Save jexp/cbd6d1d234280444550fcc20dd46b58f to your computer and use it in GitHub Desktop.
DuckDB StackOverflow Queries, DuckDB in Action Book: https://manning.com/books/duckdb-in-action – 40% discount: mlneedham 100% for reviewers

Code along on motherduck.com

ATTACH 'md:_share/stackoverflow/6c318917-6888-425a-bea1-5860c29947e5'

SELECT count(*)
FROM read_csv_auto('Tags.csv.gz');
┌──────────────┐
│ count_star() │
│    int64     │
├──────────────┤
│        64465 │
└──────────────┘
DESCRIBE(SELECT * from read_csv_auto('Tags.csv.gz') LIMIT 1);
┌───────────────┬─────────────┐
│  column_name  │ column_type │
│    varchar    │   varchar   │
├───────────────┼─────────────┤
│ Id            │ BIGINT      │
│ TagName       │ VARCHAR     │
│ Count         │ BIGINT      │
│ ExcerptPostId │ BIGINT      │
│ WikiPostId    │ BIGINT      │
└───────────────┴─────────────┘
SELECT TagName, Count
FROM read_csv('Tags.csv.gz',column_names=['Id','TagName','Count'],auto_detect=true)
ORDER BY Count DESC LIMIT 5;
┌────────────┬─────────┐
│  TagName   │  Count  │
│  varchar   │  int64  │
├────────────┼─────────┤
│ javascript │ 2479947 │
│ python     │ 2113196 │
│ java       │ 1889767 │
│ c#         │ 1583879 │
│ php        │ 1456271 │
└────────────┴─────────┘
CREATE TABLE tags AS
FROM read_csv('Tags.csv.gz',column_names=['Id','TagName','Count'],auto_detect=true)

CREATE TABLE users AS
SELECT * from read_csv('Users.csv.gz',auto_detect=true,
    column_names=['Id','Reputation','CreationDate','DisplayName',
    'LastAccessDate','AboutMe','Views','UpVotes','DownVotes']);

-- 19942787 rows

-- we can leave off the select *
CREATE TABLE posts AS 
FROM read_csv('Posts.csv.gz',auto_detect=true,
    column_names=['Id','PostTypeId','AcceptedAnswerId','CreationDate',
    'Score','ViewCount','Body','OwnerUserId','LastEditorUserId',
    'LastEditorDisplayName','LastEditDate','LastActivityDate','Title',
    'Tags','AnswerCount','CommentCount','FavoriteCount',
    'CommunityOwnedDate','ContentLicense']);

-- 58329356 rows


.timer on

SELECT DisplayName, Reputation, LastAccessDate
FROM users ORDER BY Reputation DESC LIMIT 5;
┌─────────────────┬────────────┬─────────────────────────┐
│   DisplayName   │ Reputation │     LastAccessDate      │
│     varchar     │   int64    │        timestamp        │
├─────────────────┼────────────┼─────────────────────────┤
│ Jon Skeet       │    1389256 │ 2023-03-04 19:54:19.74  │
│ Gordon Linoff   │    1228338 │ 2023-03-04 15:16:02.617 │
│ VonC            │    1194435 │ 2023-03-05 01:48:58.937 │
│ BalusC          │    1069162 │ 2023-03-04 12:49:24.637 │
│ Martijn Pieters │    1016741 │ 2023-03-03 19:35:13.76  │
└─────────────────┴────────────┴─────────────────────────┘
Run Time (s): real 0.126 user 2.969485 sys 1.696962
SELECT  year(CreationDate) as year, count(*),
        round(avg(ViewCount)), max(AnswerCount)
FROM posts
GROUP BY year ORDER BY year DESC LIMIT 10;
┌───────┬──────────────┬───────────────────────┬──────────────────┐
│ year  │ count_star() │ round(avg(ViewCount)) │ max(AnswerCount) │
│ int64 │    int64     │        double         │      int64       │
├───────┼──────────────┼───────────────────────┼──────────────────┤
│  2023 │       528575 │                  44.0 │               15 │
│  2022 │      3353468 │                 265.0 │               44 │
│  2021 │      3553972 │                 580.0 │               65 │
│  2020 │      4313416 │                 847.0 │               59 │
│  2019 │      4164538 │                1190.0 │               60 │
│  2018 │      4444220 │                1648.0 │              121 │
│  2017 │      5022978 │                1994.0 │               65 │
│  2016 │      5277269 │                2202.0 │               74 │
│  2015 │      5347794 │                2349.0 │               82 │
│  2014 │      5342607 │                2841.0 │               92 │
├───────┴──────────────┴───────────────────────┴──────────────────┤
│ 10 rows                                               4 columns │
└─────────────────────────────────────────────────────────────────┘
Run Time (s): real 5.977 user 7.498157 sys 5.480121 (1st run)
Run Time (s): real 0.039 user 4.609049 sys 0.078694
COPY (FROM users) TO 'users.parquet'
(FORMAT PARQUET, CODEC 'SNAPPY', ROW_GROUP_SIZE 100000);
-- Run Time (s): real 10.582 user 62.737265 sys 65.422181

COPY (FROM posts) TO 'posts.parquet'
(FORMAT PARQUET, CODEC 'SNAPPY', ROW_GROUP_SIZE 100000);
-- Run Time (s): real 57.314 user 409.517658 sys 334.606894

Upload database

CREATE DATABASE remote_database_name FROM CURRENT_DATABASE();

-- or more generally
CREATE DATABASE remote_database_name FROM '<local database name>';

Create database & tables

create database so;

create table users as
from 's3://us-prd-motherduck-open-datasets/stackoverflow/parquet/2023-05/users.parquet';
-- Run Time (s): real 10.401 user 0.006417 sys 0.003527

describe users;
┌────────────────┬─────────────┐
│  column_name   │ column_type │
│    varchar     │   varchar   │
├────────────────┼─────────────┤
│ Id             │ BIGINT      │
│ Reputation     │ BIGINT      │
│ CreationDate   │ TIMESTAMP   │
│ DisplayName    │ VARCHAR     │
│ LastAccessDate │ TIMESTAMP   │
│ AboutMe        │ VARCHAR     │
│ Views          │ BIGINT      │
│ UpVotes        │ BIGINT      │
│ DownVotes      │ BIGINT      │
│ Id             │ BIGINT      │
│ Reputation     │ BIGINT      │
│ CreationDate   │ TIMESTAMP   │
│ DisplayName    │ VARCHAR     │
│ LastAccessDate │ TIMESTAMP   │
│ AboutMe        │ VARCHAR     │
│ Views          │ BIGINT      │
│ UpVotes        │ BIGINT      │
│ DownVotes      │ BIGINT      │
├────────────────┴─────────────┤
│ 18 rows                      │
└──────────────────────────────┘
Run Time (s): real 0.032 user 0.026184 sys 0.002383
-- do the same for the other tables

create table comments as
from 's3://us-prd-motherduck-open-datasets/stackoverflow/parquet/2023-05/comments.parquet';
create table posts as
from 's3://us-prd-motherduck-open-datasets/stackoverflow/parquet/2023-05/posts.parquet';
create table votes as
from 's3://us-prd-motherduck-open-datasets/stackoverflow/parquet/2023-05/votes.parquet';
create table badges as
from 's3://us-prd-motherduck-open-datasets/stackoverflow/parquet/2023-05/badges.parquet';
create table post_links as
from 's3://us-prd-motherduck-open-datasets/stackoverflow/parquet/2023-05/post_links.parquet';
create table tags as
from 's3://us-prd-motherduck-open-datasets/stackoverflow/parquet/2023-05/tags.parquet';

AI/LLM Query Commands

CALL prompt_schema();

-- summary = The database contains information related to posts, comments, votes, badges, tags, post links, and users for a platform.

-- Run Time (s): real 1.476 user 0.001069 sys 0.000778

-- summary = The database schema represents a collection of data about various aspects of a community platform, including users, posts, comments, tags, badges, votes, and post links.

.mode duckbox
pragma prompt_query('What are the most popular tags?');
┌────────────┬─────────┐
│  TagName   │  Count  │
│  varchar   │  int64  │
├────────────┼─────────┤
│ javascript │ 2479947 │
│ python     │ 2113196 │
│ java       │ 1889767 │
│ c#         │ 1583879 │
│ php        │ 1456271 │
│ android    │ 1400026 │
│ html       │ 1167742 │
│ jquery     │ 1033113 │
│ c++        │  789699 │
│ css        │  787138 │
├────────────┴─────────┤
│ 10 rows    2 columns │
└──────────────────────┘
-- Run Time (s): real 3.763 user 0.124567 sys 0.001716
.mode line
call prompt_sql('What are the most popular tags?');

-- query = SELECT TagName, Count FROM tags ORDER BY Count DESC LIMIT 5;
-- Run Time (s): real 2.813 user 2.808042 sys 0.005866

pragma prompt_query("What question has the highest score and what are it's other attributes?");
                   Id = 11227809
           PostTypeId = 1
     AcceptedAnswerId = 11227902
         CreationDate = 2012-06-27 13:51:36.16
                Score = 26903
            ViewCount = 1796363
                 Body =
          OwnerUserId = 87234
     LastEditorUserId = 87234
LastEditorDisplayName =
         LastEditDate = 2022-10-12 18:56:47.68
     LastActivityDate = 2023-01-10 04:40:07.12
                Title = Why is processing a sorted array faster than processing an unsorted array?
                 Tags = <java><c++><performance><cpu-architecture><branch-prediction>
          AnswerCount = 26
         CommentCount = 9
        FavoriteCount = 0
   CommunityOwnedDate =
       ContentLicense = CC BY-SA 4.0
call prompt_sql("What question has the highest score and what are it's other attributes?");
query = SELECT *
FROM posts
WHERE PostTypeId = 1
ORDER BY Score DESC
LIMIT 1;
-- Run Time (s): real 3.683 user 0.001970 sys 0.000994
.mode duckbox
pragma prompt_query("Which 5 questions have the most comments, what is the post title and comment count");
┌───────────────────────────────────────────────────────────────────────────┬───────────────┐
│                                          Title                            │ comment_count │
│                                         varchar                           │     int64     │
├───────────────────────────────────────────────────────────────────────────┼───────────────┤
│ UIImageView Frame Doesnt Reflect Constraints                              │           108 │
│ Is it possible to use adb commands to click on a view by finding its ID?  │           102 │
│ How to create a new web character symbol recognizable by html/javascript? │           100 │
│ Why isnt my CSS3 animation smooth in Google Chrome (but very smooth on ot │            89 │
│ Heap Gives Page Fault                                                     │            89 │
└───────────────────────────────────────────────────────────────────────────┴───────────────┘
Run Time (s): real 19.695 user 2.406446 sys 0.018353
.mode line
call prompt_sql("Which 5 questions have the most comments, what is the post title and comment count");

query = SELECT p.Title, COUNT(c.Id) AS comment_count
FROM posts p
JOIN comments c ON p.Id = c.PostId AND p.PostTypeId = 1
GROUP BY p.Title
ORDER BY comment_count DESC
LIMIT 5;
-- Run Time (s): real 4.795 user 0.002301 sys 0.001346
call prompt_sql("System: No joins! User: Which 5 questions have the most comments, what is the post title and comment count");

-- query = 
SELECT Title, CommentCount
FROM posts
WHERE PostTypeId = 1
ORDER BY CommentCount DESC
LIMIT 5;
-- Run Time (s): real 3.587 user 0.001733 sys 0.000865
call prompt_fixup("select postTypeId, count(*) from posts");

-- query = 
SELECT postTypeId, COUNT(*) FROM posts GROUP BY postTypeId
-- Run Time (s): real 12.006 user 0.004266 sys 0.002980

call prompt_fixup("select count(*) from posts join users on posts.userId = users.userId");

-- query = 
SELECT COUNT(*) FROM posts JOIN users ON posts.OwnerUserId = users.Id
-- Run Time (s): real 2.378 user 0.001770 sys 0.001067

Create & Attach Shares

-- CREATE SHARE <share name> [FROM <database name>];
CREATE SHARE so_2023_05 FROM so;
-- share_url = md:_share/so/373594a2-06f7-4c33-814e-cf59028482ca
-- Run Time (s): real 63.335 user 0.014849 sys 0.013110

-- ATTACH '<share URL>' [AS <database name>];
ATTACH 'md:_share/so/373594a2-06f7-4c33-814e-cf59028482ca' AS so;

-- show the contents of the share
DESCRIBE SHARE "so_2023_05";

LIST SHARES;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment