Skip to content

Instantly share code, notes, and snippets.

@Gdahuks
Last active March 20, 2024 13:44
Show Gist options
  • Save Gdahuks/64844bf5950e5be95a274f9266a1817c to your computer and use it in GitHub Desktop.
Save Gdahuks/64844bf5950e5be95a274f9266a1817c to your computer and use it in GitHub Desktop.
# This script reformats multiple CSV files containing environmental data
# (see https://gist.github.com/Gdahuks/ae97bebaaff937189a235f9dfecb41e1)
# into a single Parquet table with the date and time as index and the station IDs as columns.
# Input: B00300S_2012_01.csv, B00300S_2012_01.csv, ..., B00300S_2022_12.csv
# 249190890;B00300S;2012-01-01 00:00;-2,9;
# 249190890;B00300S;2012-01-01 00:10;-2,8;
# 249190890;B00300S;2012-01-01 00:20;-2,9;
# 249190890;B00300S;2012-01-01 00:30;-3,3;
# ...
# Output: results.parquet (visualized)
# datetime 249190890 249190090 249190440 ...
# 2022-05-01 02:00:00 4.6 6.81 3.91 ...
# 2022-05-01 02:10:00 4.1 6.76 4.01 ...
# 2022-05-01 02:20:00 3.9 6.54 3.93 ...
# 2022-05-01 02:30:00 3.6 6.47 3.92 ...
# ... ... ... ... ...
import os
import glob
import polars as pl
DATA_DIR = "stations_all/"
RESULT_FILE_NAME = "results.parquet"
csv_files = glob.glob(os.path.join(DATA_DIR, "*.csv"))
(
pl.scan_csv(
csv_files,
has_header=False,
separator=";",
schema={
"station_id": pl.UInt32,
"measuerment": pl.String,
"datetime": pl.Datetime,
"measurement_value": pl.String,
"other": pl.String,
},
)
.with_columns(
pl.col("measurement_value")
.map_elements(
lambda element: element.replace(',', '.'),
return_dtype=pl.String
)
.cast(pl.Float32)
)
.select(
[
pl.col("datetime"),
pl.col("station_id"),
pl.col("measurement_value"),
]
)
.collect()
.pivot(
index="datetime",
columns="station_id",
values="measurement_value",
aggregate_function=None
)
).write_parquet(RESULT_FILE_NAME)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment