SELECT * FROM dataset
#+RESULTS[6ad7bb7ab69c578be4bc2739ef99eac1d49cce37]:
id | title | description | created | modified | author |
---|---|---|---|---|---|
5eaae02a-df7d-4581-9a6d-00b57b0c69ea | 10K Sales Records | 2020-04-30 14:26:50.069044+00 | 2020-04-30 14:26:50.069044+00 | {“name”: “Jerome Eginla”, “email”: “jerome@t1.akvolumen.org”, “given_name”: “Jerome”, “family_name”: “Eginla”} |
SELECT table_name FROM dataset_version
#+RESULTS[0685eee9cc1b7e301662163a53a8d35e267189f3]:
table_name |
---|
ds_aac7c5d0_f289_4b64_ac0b_c957f69af29c |
SELECT COUNT(*) FROM ds_aac7c5d0_f289_4b64_ac0b_c957f69af29c
#+RESULTS[063e6c80188ed6843250d48e7040edca89099252]:
count |
---|
10000 |
SELECT rnum
FROM (
SELECT *
FROM ds_aac7c5d0_f289_4b64_ac0b_c957f69af29c
TABLESAMPLE BERNOULLI(25.0) --25%
) AS sample
LIMIT 10
#+RESULTS[d5930ec66da80be3ad23738d551d94f5dace13e3]:
rnum |
---|
3 |
8 |
10 |
13 |
14 |
17 |
18 |
21 |
23 |
27 |
SELECT rnum
FROM (
SELECT *
FROM ds_aac7c5d0_f289_4b64_ac0b_c957f69af29c
TABLESAMPLE BERNOULLI(25.0) --25%
) AS sample
LIMIT 10
#+RESULTS[d5930ec66da80be3ad23738d551d94f5dace13e3]:
rnum |
---|
10 |
12 |
15 |
18 |
23 |
26 |
29 |
30 |
42 |
43 |
Every execution returns a different set of rows
We can use a seed
to get the same results
SELECT rnum
FROM (
SELECT *
FROM ds_aac7c5d0_f289_4b64_ac0b_c957f69af29c
TABLESAMPLE BERNOULLI(25.0) --25%
REPEATABLE (42)
) AS sample
LIMIT 10
#+RESULTS[ab76e3a4554d8bd732eb217330ee2c8f3e78c06c]:
rnum |
---|
2 |
3 |
7 |
8 |
12 |
13 |
15 |
20 |
22 |
27 |
One more execution
SELECT rnum
FROM (
SELECT *
FROM ds_aac7c5d0_f289_4b64_ac0b_c957f69af29c
TABLESAMPLE BERNOULLI(25.0) --25%
REPEATABLE (42)
) AS sample
LIMIT 10
#+RESULTS[44d73581c6025b11a6558a98500f8ba163064f38]:
rnum |
---|
2 |
3 |
7 |
8 |
12 |
13 |
15 |
20 |
22 |
27 |
EXPLAIN ANALYZE SELECT * FROM ds_aac7c5d0_f289_4b64_ac0b_c957f69af29c TABLESAMPLE BERNOULLI(25.0)
#+RESULTS[4f8c232b20c0e365207233c27ae9e823544d18a1]:
QUERY PLAN |
---|
Sample Scan on ds_aac7c5d0_f289_4b64_ac0b_c957f69af29c (cost=0.00..227.00 rows=2500 width=121) (actual time=0.008..1.355 rows=2535 loops=1) |
Sampling: bernoulli (‘25’::real) |
Planning time: 0.337 ms |
Execution time: 1.525 ms |
EXPLAIN ANALYZE SELECT * FROM ds_aac7c5d0_f289_4b64_ac0b_c957f69af29c TABLESAMPLE SYSTEM(25.0)
#+RESULTS[d509755f4eca169f4378a1fd8d2936361293f5ad]:
QUERY PLAN |
---|
Sample Scan on ds_aac7c5d0_f289_4b64_ac0b_c957f69af29c (cost=0.00..225.00 rows=2500 width=121) (actual time=0.007..0.517 rows=2425 loops=1) |
Sampling: system (‘25’::real) |
Planning time: 0.340 ms |
Execution time: 0.703 ms |
Not too much differences with a 10K table
curl -s -L -o /tmp/sample.csv https://vincentarelbundock.github.io/Rdatasets/csv/carData/MplsStops.csv
wc -l /tmp/sample.csv
head /tmp/sample.csv
If we want to load it fast we need to use the COPY
protocol.
Since this happens local to the machine running psql
we need to get
the file into the backend
container.
docker-compose exec -T backend \
curl -s -L -o /tmp/sample.csv https://vincentarelbundock.github.io/Rdatasets/csv/carData/MplsStops.csv
docker-compose exec -T backend \
wc -l /tmp/tmp/sample.csv
docker-compose exec -T backend \
head /tmp/tmp/sample.csv
CREATE TABLE sample_data (
c1 TEXT,
c2 TEXT,
c3 TEXT,
c4 TEXT,
c5 TEXT,
c6 TEXT,
c7 TEXT,
c8 TEXT,
c9 TEXT,
c10 TEXT,
c11 TEXT,
c12 TEXT,
c13 TEXT,
c14 TEXT,
c15 TEXT
);
#+RESULTS[1939e5d134a3e8d15ec9715452bdec1fbfdce20f]:
CREATE TABLE |
---|
docker-compose exec -T backend \
psql --username=lumen \
--host=postgres \
--dbname=lumen_tenant_1 \
--no-password \
--command "\copy sample_data FROM '/tmp/sample.csv' WITH DELIMITER ',' CSV HEADER;"
SELECT * FROM sample_data LIMIT 5;
#+RESULTS[d0d42475d74895c591438917f8d1a7ee9496bc03]:
c1 | c2 | c3 | c4 | c5 | c6 | c7 | c8 | c9 | c10 | c11 | c12 | c13 | c14 | c15 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
6823 | 17-000003 | 2017-01-01 00:00:42 | suspicious | MDC | NA | NO | NO | Unknown | Unknown | Unknown | 44.96661711 | -93.24645826 | 1 | Cedar Riverside |
6824 | 17-000007 | 2017-01-01 00:03:07 | suspicious | MDC | NA | NO | NO | Unknown | Unknown | Male | 44.98045 | -93.27134 | 1 | Downtown West |
6825 | 17-000073 | 2017-01-01 00:23:15 | traffic | MDC | NA | NO | NO | Unknown | White | Female | 44.94835 | -93.27538 | 5 | Whittier |
6826 | 17-000092 | 2017-01-01 00:33:48 | suspicious | MDC | NA | NO | NO | Unknown | East African | Male | 44.94836 | -93.28135 | 5 | Whittier |
6827 | 17-000098 | 2017-01-01 00:37:58 | traffic | MDC | NA | NO | NO | Unknown | White | Female | 44.9790778 | -93.26207579 | 1 | Downtown West |
SELECT COUNT(*) FROM sample_data;
#+RESULTS[ab4ff0f11f95234de0c58415b3ee1a7ce0d59997]:
count |
---|
51920 |
SELECT c1
FROM (
SELECT *
FROM sample_data
TABLESAMPLE BERNOULLI(25.0)
REPEATABLE(200) -- random seed
) AS sample
LIMIT 5
#+RESULTS[c7d0c2f11f7d7390a11f7c1ff8e1fabe5bfde4d9]:
c1 |
---|
6825 |
6828 |
6847 |
6850 |
6851 |
SELECT c1
FROM (
SELECT *
FROM sample_data
TABLESAMPLE BERNOULLI(25.0)
REPEATABLE(200) -- random seed
) AS sample
LIMIT 5
#+RESULTS[c7d0c2f11f7d7390a11f7c1ff8e1fabe5bfde4d9]:
c1 |
---|
6825 |
6828 |
6847 |
6850 |
6851 |