The following assumes use of clickhouse-local
, although these steps will alsowork with ClickHouse server.
CREATE TABLE uk_price_paid
(
`price` UInt32,
`date` Date,
`postcode1` LowCardinality(String),
`postcode2` LowCardinality(String),
`type` Enum8('terraced' = 1, 'semi-detached' = 2, 'detached' = 3, 'flat' = 4, 'other' = 0),
`is_new` UInt8,
`duration` Enum8('freehold' = 1, 'leasehold' = 2, 'unknown' = 0),
`addr1` String,
`addr2` String,
`street` LowCardinality(String),
`locality` LowCardinality(String),
`town` LowCardinality(String),
`district` LowCardinality(String),
`county` LowCardinality(String)
)
ENGINE = MergeTree
ORDER BY (postcode1, postcode2, addr1, addr2)
INSERT INTO uk_price_paid
SETTINGS max_http_get_redirects = 10
WITH splitByChar(' ', postcode) AS p
SELECT
toUInt32(price_string) AS price,
parseDateTimeBestEffortUS(time) AS date,
p[1] AS postcode1,
p[2] AS postcode2,
transform(a, ['T', 'S', 'D', 'F', 'O'], ['terraced', 'semi-detached', 'detached', 'flat', 'other']) AS type,
b = 'Y' AS is_new,
transform(c, ['F', 'L', 'U'], ['freehold', 'leasehold', 'unknown']) AS duration,
addr1,
addr2,
street,
locality,
town,
district,
county
FROM url('http://prod.publicdata.landregistry.gov.uk.s3-website-eu-west-1.amazonaws.com/pp-complete.csv', 'CSV', 'uuid_string String,\n price_string String,\n time String,\n postcode String,\n a String,\n b String,\n c String,\n addr1 String,\n addr2 String,\n street String,\n locality String,\n town String,\n district String,\n county String,\n d String,\n e String')
SETTINGS max_http_get_redirects = 10
Note: the following utilises the file
function in ClickHouse. For clickhouse-local
files will be output to the local file server (paths can be specified).
For ClickHouse Server files be written to the server filesystem under the configured user_files_path
. Users wishing to write to the client file system can adjust the following to use the INTO OUTFILE
clause.
Either, write a single file:
INSERT INTO FUNCTION file('house_prices.csv') SELECT * FROM uk_price_paid
or as multiple files using PARTITION BY
e.g. by Year:
INSERT INTO FUNCTION file('house_prices_{_partition_id}.parquet', 'Parquet') PARTITION BY toYear(date) SELECT * FROM uk_price_paid