Skip to content

Instantly share code, notes, and snippets.

View cswinter's full-sized avatar

Clemens Winter cswinter

View GitHub Profile
COPY (
SELECT trips.id,
trips.vendor_id,
trips.pickup_datetime,
trips.dropoff_datetime,
trips.store_and_fwd_flag,
trips.rate_code_id,
trips.pickup_longitude,
trips.pickup_latitude,
trips.dropoff_longitude,
This file has been truncated, but you can view the full file.
➜ ~ sudo service clickhouse-server status
● clickhouse-server.service - ClickHouse Server (analytic DBMS for big data)
Loaded: loaded (/etc/systemd/system/clickhouse-server.service; enabled; vendor preset: enabled)
Active: active (running) since Wed 2018-07-04 15:28:31 PDT; 4h 35min ago
Main PID: 1108 (clickhouse-serv)
CGroup: /system.slice/clickhouse-server.service
└─1108 /usr/bin/clickhouse-server --config=/etc/clickhouse-server/config.xml
Jul 04 15:28:31 Archon systemd[1]: Starting ClickHouse Server (analytic DBMS for big data)...
➜ LocustDB git:(master) git pull
Enter passphrase for key '/home/clemens/.ssh/id_rsa':
remote: Counting objects: 17, done.
remote: Compressing objects: 100% (4/4), done.
remote: Total 17 (delta 11), reused 17 (delta 11), pack-reused 0
Unpacking objects: 100% (17/17), done.
From github.com:cswinter/LocustDB
106acb1..a10d938 master -> origin/master
Updating 106acb1..a10d938
Fast-forward
➜ LocustDB git:(master) RUSTFLAGS="-Ccodegen-units=1" CARGO_INCREMENTAL=0 cargo run --features nerf --release --bin repl -- nyc
Finished release [optimized] target(s) in 0.06s
Running `target/release/repl nyc`
NERFED!
Loading nyc into table trips.
Loaded data in 1121 seconds.
# Table `trips` (1464785771 rows, 42794.30 MiB) #
payment_type: 1397.69MiB
pickup_puma: 2611.38MiB
m:(`trip_id ;"I";
`vendor_id ;"*";
`pickup_datetime ;"P";
`dropoff_datetime ;"P";
`store_and_fwd_flag ;"B";
`rate_code_id ;"X";
`pickup_longitude ;"E";
`pickup_latitude ;"E";
`dropoff_longitude ;"E";
`dropoff_latitude ;"E";
#!/bin/sh
ls /home/clemens/Dropbox/nyc-taxi-data-denormalized/* | \
xargs \
-P 6 `# running with -P 12 angers the oom killer` \
-L 1 \
~/q/l64/q load.q \
-q \
-s 2
import sys
for line in sys.stdin:
print ','.join([item if len(item.strip()) else '\N'
for item in line.strip().split(',')])
CREATE TABLE trips (
trip_id UInt32,
vendor_id String,
pickup_datetime DateTime,
dropoff_datetime Nullable(DateTime),
store_and_fwd_flag Nullable(FixedString(1)),
rate_code_id Nullable(UInt8),
pickup_longitude Nullable(Float64),
time (for filename in /home/clemens/Dropbox/nyc-taxi-data-denormalized/trips_x*.csv.gz; do
gunzip -c $filename | \
python trans.py | \
clickhouse-client \
--query="INSERT INTO trips FORMAT CSV"
done)
CREATE TABLE trips_mergetree
ENGINE = MergeTree(pickup_date, pickup_datetime, 8192)
AS SELECT
trip_id,
CAST(vendor_id AS Enum8('1' = 1, '2' = 2, '3' = 3, 'CMT' = 4, 'VTS' = 5, 'DDS' = 6, 'B02512' = 10, 'B02598' = 11, 'B02617' = 12, 'B02682' = 13, 'B02764' = 14)) AS vendor_id,
toDate(pickup_datetime) AS pickup_date,
ifNull(pickup_datetime, toDateTime(0)) AS pickup_datetime,
toDate(dropoff_datetime) AS dropoff_date,
ifNull(dropoff_datetime, toDateTime(0)) AS dropoff_datetime,
assumeNotNull(store_and_fwd_flag) IN ('Y', '1', '2') AS store_and_fwd_flag,