Gdahuks/meto.py

## meto.py
# This script reformats multiple CSV files containing environmental data
# (see https://gist.github.com/Gdahuks/ae97bebaaff937189a235f9dfecb41e1)
# into a single Parquet table with the date and time as index and the station IDs as columns.

# Input: B00300S_2012_01.csv, B00300S_2012_01.csv, ..., B00300S_2022_12.csv
# 249190890;B00300S;2012-01-01 00:00;-2,9;
# 249190890;B00300S;2012-01-01 00:10;-2,8;
# 249190890;B00300S;2012-01-01 00:20;-2,9;
# 249190890;B00300S;2012-01-01 00:30;-3,3;
# ...

# Output: results.parquet (visualized)
#            datetime  249190890  249190090  249190440 ...
# 2022-05-01 02:00:00        4.6       6.81       3.91 ...
# 2022-05-01 02:10:00        4.1       6.76       4.01 ...
# 2022-05-01 02:20:00        3.9       6.54       3.93 ...
# 2022-05-01 02:30:00        3.6       6.47       3.92 ...
#                 ...        ...        ...        ... ...

import os
import glob
import polars as pl

DATA_DIR = "stations_all/"
RESULT_FILE_NAME = "results.parquet"

csv_files = glob.glob(os.path.join(DATA_DIR, "*.csv"))

(
    pl.scan_csv(
        csv_files,
        has_header=False,
        separator=";",
        schema={
            "station_id": pl.UInt32,
            "measuerment": pl.String,
            "datetime": pl.Datetime,
            "measurement_value": pl.String,
            "other": pl.String,
        },
    )
    .with_columns(
        pl.col("measurement_value")
        .map_elements(
            lambda element: element.replace(',', '.'),
            return_dtype=pl.String
        )
        .cast(pl.Float32)
    )
    .select(
        [
            pl.col("datetime"),
            pl.col("station_id"),
            pl.col("measurement_value"),
        ]
    )
    .collect()
    .pivot(
        index="datetime",
        columns="station_id",
        values="measurement_value",
        aggregate_function=None
    )
).write_parquet(RESULT_FILE_NAME)
	# This script reformats multiple CSV files containing environmental data
	# (see https://gist.github.com/Gdahuks/ae97bebaaff937189a235f9dfecb41e1)
	# into a single Parquet table with the date and time as index and the station IDs as columns.

	# Input: B00300S_2012_01.csv, B00300S_2012_01.csv, ..., B00300S_2022_12.csv
	# 249190890;B00300S;2012-01-01 00:00;-2,9;
	# 249190890;B00300S;2012-01-01 00:10;-2,8;
	# 249190890;B00300S;2012-01-01 00:20;-2,9;
	# 249190890;B00300S;2012-01-01 00:30;-3,3;
	# ...

	# Output: results.parquet (visualized)
	# datetime 249190890 249190090 249190440 ...
	# 2022-05-01 02:00:00 4.6 6.81 3.91 ...
	# 2022-05-01 02:10:00 4.1 6.76 4.01 ...
	# 2022-05-01 02:20:00 3.9 6.54 3.93 ...
	# 2022-05-01 02:30:00 3.6 6.47 3.92 ...
	# ... ... ... ... ...

	import os
	import glob
	import polars as pl

	DATA_DIR = "stations_all/"
	RESULT_FILE_NAME = "results.parquet"

	csv_files = glob.glob(os.path.join(DATA_DIR, "*.csv"))

	(
	pl.scan_csv(
	csv_files,
	has_header=False,
	separator=";",
	schema={
	"station_id": pl.UInt32,
	"measuerment": pl.String,
	"datetime": pl.Datetime,
	"measurement_value": pl.String,
	"other": pl.String,
	},
	)
	.with_columns(
	pl.col("measurement_value")
	.map_elements(
	lambda element: element.replace(',', '.'),
	return_dtype=pl.String
	)
	.cast(pl.Float32)
	)
	.select(
	[
	pl.col("datetime"),
	pl.col("station_id"),
	pl.col("measurement_value"),
	]
	)
	.collect()
	.pivot(
	index="datetime",
	columns="station_id",
	values="measurement_value",
	aggregate_function=None
	)
	).write_parquet(RESULT_FILE_NAME)