Skip to content

Instantly share code, notes, and snippets.

View gingerwizard's full-sized avatar

Dale McDiarmid gingerwizard

View GitHub Profile

Download data

wget https://datasets-documentation.s3.eu-west-3.amazonaws.com/nyc-taxi/nyc-taxi-vectors.csv.gz
gzip -d nyc-taxi-vectors.csv.gz

Install Dependencies

@gingerwizard
gingerwizard / loading_100m_transactions.md
Created April 9, 2024 12:53
Loading 100m transactions for kmeans
-- data table: rows are stored sorted by id (the MergeTree sorting key).
-- NOTE(review): `vector` presumably holds the embedding used for clustering — confirm against the loader.
CREATE TABLE transactions
(
  id UInt32,
  vector Array(Float32),
  customer UInt32
)
ENGINE = MergeTree -- this can be a Null engine
ORDER BY id;
CREATE TABLE surveys
(
    `response_id` Int64,
    `development_activity` Enum8('I am a developer by profession' = 1, 'I am a student who is learning to code' = 2, 'I am not primarily a developer, but I write code sometimes as part of my work' = 3, 'I code primarily as a hobby' = 4, 'I used to be a developer by profession, but no longer am' = 5, 'None of these' = 6, 'NA' = 7),
    `employment` Enum8('Independent contractor, freelancer, or self-employed' = 1, 'Student, full-time' = 2, 'Employed full-time' = 3, 'Student, part-time' = 4, 'I prefer not to say' = 5, 'Employed part-time' = 6, 'Not employed, but looking for work' = 7, 'Retired' = 8, 'Not employed, and not looking for work' = 9, 'NA' = 10),
    `country` LowCardinality(String),
    `us_state` LowCardinality(String),
    `uk_county` LowCardinality(String),
    `education_level` Enum8('Secondary school (e.g. American high school, German Realschule or Gymnasium, etc.)' = 1, 'Bachelor’s degree (B.A., B.S., B.Eng., etc.)' = 2, 'Master’s degree (M.
-- original table
CREATE TABLE hackernews_copy
(
    `id` String,
    `doc_id` String,
    `comment` String,
    `vector` Array(Float32),
    `node_info` Tuple(start Nullable(UInt64), end Nullable(UInt64)),
    `metadata` String,
#!/bin/bash
# Abort early unless the ClickHouse Cloud API credentials and the AWS
# credentials are all present in the environment.
if [[ -z "$CLOUD_ID" || -z "$CLOUD_SECRET" || -z "$AWS_ACCESS_KEY_ID" || -z "$AWS_SECRET_ACCESS_KEY" ]]; then
  # Diagnostics go to stderr so they are not captured with regular output.
  echo "Error: Required environment variables are not set." >&2
  exit 1
fi
# identify the organization to create the service in
# The organizations endpoint is queried once; id and name are both read from
# the first entry of that single response. The credentials are quoted so that
# special characters in the key or secret are passed to curl verbatim.
ORGS_JSON=$(curl --silent --user "$CLOUD_ID:$CLOUD_SECRET" https://api.clickhouse.cloud/v1/organizations)
ORG_ID=$(jq -r '.result[0].id' <<<"$ORGS_JSON")
ORG_NAME=$(jq -r '.result[0].name' <<<"$ORGS_JSON")
# Print, comma-separated, the system.contributors rows that appear in the
# 24.1 image but not in the 23.12 image.
#   grep    keeps only the "+" lines of the unified diff
#   tail    drops the first of those, which is the "+++" file header
#   sed     strips the leading "+" marker
#   tr      joins the remaining names with commas
diff -u \
  <(docker run --rm clickhouse/clickhouse-server:23.12 clickhouse-local --query "SELECT * FROM system.contributors ORDER BY name") \
  <(docker run --rm clickhouse/clickhouse-server:24.1 clickhouse-local --query "SELECT * FROM system.contributors ORDER BY name") |
  grep -E "^\+" |
  tail -n +2 |
  sed 's/^\+//' |
  tr '\n' ','
CREATE EXTERNAL TABLE IF NOT EXISTS ookla (
  quadkey string,
  tile string,
  avg_d_kbps int,
  avg_u_kbps int,
  avg_lat_ms int,
  avg_lat_down_ms int,
  avg_lat_up_ms int,
 tests int,
@gingerwizard
gingerwizard / ski_resort_stats.csv
Created January 12, 2024 17:14
ski_resort_stats.csv
resort_name state summit base vertical lifts runs acres green_percent green_acres blue_percent blue_acres black_percent black_acres lat lon
49(degrees) North, WA Washington 5774 3923 1851 6 68 2325 0.3 697.5 0.4 930 0.3 697.5 49 -115.84
Afton Alps, MN Minnesota 700 350 350 21 48 300 0.2 60 0.6 180 0.2 60 44.85 -92.79
Alpine Meadows, CA California 8673 6385 2288 13 100 2400 0.25 600 0.4 960 0.35 840 39.17 -120.22
Alpine Mountain, PA Pennsylvania 1150 600 500 3 21 120 0.17 20.4 0.55 66 0.28 33.6 40.08 -76.9
Alpine Valley, MI Michigan 1210 910 300 14 25 118 0.39 46.02 0.26 30.68 0.35 41.3 44.96 -84.87
Alpine Valley, OH Ohio 1500 1260 230 6 11 72 0.33 23.76 0.34 24.48 0.33 23.76 40.31 -83.68
Alpine Valley, WI Wisconsin 388 16 20 90 0.4 36 0.4 36 0.2 18 42.45 -88.25
Alta, UT Utah 10550 8530 2020 11 116 2200 0.25 550 0.4 880 0.35 770 40.6 -111.64
Alyeska, AK Alaska 3939 250 2500 9 73 1500 0.11 165 0.52 780 0.37 555 60.97 -149.11
-- Print the column names and types of the Parquet files matching the glob,
-- one tab-separated row per column.
DESCRIBE TABLE gcs(
    'https://storage.googleapis.com/clickhouse-website/ga-*.parquet',
    '<access key>',
    '<secret>'
)
FORMAT TSV

event_date    Nullable(String)
event_timestamp    Nullable(Int64)
event_name    Nullable(String)
event_params    Array(Tuple(key Nullable(String), value Tuple(string_value Nullable(String), int_value Nullable(Int64), float_value Nullable(Float64), double_value Nullable(Float64))))
event_previous_timestamp    Nullable(Int64)
event_value_in_usd    Nullable(Float64)
/*-----------------------------------------------------------------------*/
/* Program: STREAM */
/* Revision: $Id: stream.c,v 5.10 2013/01/17 16:01:06 mccalpin Exp mccalpin $ */
/* Original code developed by John D. McCalpin */
/* Programmers: John D. McCalpin */
/* Joe R. Zagar */
/* */
/* This program measures memory transfer rates in MB/s for simple */
/* computational kernels coded in C. */
/*-----------------------------------------------------------------------*/