Skip to content

Instantly share code, notes, and snippets.

@tom-clickhouse
Created August 27, 2023 15:04
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save tom-clickhouse/0c1b4d70c4fbebd7a14eb756d1ebc914 to your computer and use it in GitHub Desktop.
Save tom-clickhouse/0c1b4d70c4fbebd7a14eb756d1ebc914 to your computer and use it in GitHub Desktop.
-- Bulk-load the files matching pageviews-202301*.gz from the S3 URL below
-- into wikistat. Each input line is read as a single string (LineAsString)
-- and split into fields here; the read goes through s3Cluster on the
-- cluster named 'default'.
-- NOTE(review): no column list is given on the INSERT, so the SELECT output
-- is presumably matched to wikistat's columns by position — confirm the
-- order against the table definition.
INSERT INTO wikistat
WITH
    -- Timestamp parsed from the digits/dashes captured out of the file name.
    parseDateTimeBestEffort(
        extract(_file, '^pageviews-([\\d\\-]+)\\.gz$')
    ) AS file_time,
    -- Space-separated fields of the raw line.
    splitByChar(' ', line) AS fields,
    -- Dot-separated parts of the first field.
    splitByChar('.', fields[1]) AS domain_parts
SELECT
    file_time AS time,
    domain_parts[1] AS project,
    domain_parts[2] AS subproject,
    decodeURLComponent(fields[2]) AS path,
    CAST(fields[3], 'UInt64') AS hits
FROM s3Cluster(
    'default',
    'https://clickhouse-public-datasets.s3.amazonaws.com/wikistat/original/pageviews-202301*.gz',
    'LineAsString'
)
-- Keep only lines that split into at least three fields.
WHERE length(fields) >= 3
-- Insert block sizing and parallelism settings for this statement.
SETTINGS
    max_insert_block_size = 25_000_000,
    min_insert_block_size_rows = 25_000_000,
    min_insert_block_size_bytes = 500_000_000,
    parallel_distributed_insert_select = 2,
    max_insert_threads = 30;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment