@avriiil
Created November 14, 2022 15:19
Download Kaggle NYC bike data for Dask benchmarking
import json
from pathlib import Path
import duckdb
import kaggle
from loguru import logger
from tqdm import tqdm
# To access this data, you must create a Kaggle account and obtain an API key.
# You can create a key by clicking your icon in the upper right of the Kaggle
# homepage, clicking Account, and then clicking to create a new API token.
# This downloads a kaggle.json credentials file containing the API key.
# Move that kaggle.json file to ~/.kaggle/kaggle.json.
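# Not in the original gist: an optional check that logs which Kaggle account the
# credentials file points at, assuming the default ~/.kaggle/kaggle.json location.
_creds_path = Path("~/.kaggle/kaggle.json").expanduser()
if _creds_path.exists():
    _creds = json.loads(_creds_path.read_text())
    logger.info("Using Kaggle credentials for user {!r}.", _creds.get("username"))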

def csv_to_parquet(infile: str, outfile: str, sample_size=512_000) -> None:
    conn = duckdb.connect(":memory:")
    # We cannot autodetect columns due to some messiness in the data. For example,
    # station_id is mostly an integer except for some weird stations that have a long
    # UUID-esque ID. These types of stations do not appear in every file.
    columns = {
        "station_id": "VARCHAR",
        "num_bikes_available": "INTEGER",
        "num_ebikes_available": "VARCHAR",
        "num_bikes_disabled": "VARCHAR",
        "num_docks_available": "INTEGER",
        "num_docks_disabled": "VARCHAR",
        "is_installed": "VARCHAR",
        "is_renting": "VARCHAR",
        "is_returning": "VARCHAR",
        "station_status_last_reported": "INTEGER",
        "station_name": "VARCHAR",
        "lat": "VARCHAR",
        "lon": "VARCHAR",
        "region_id": "VARCHAR",
        "capacity": "VARCHAR",
        "has_kiosk": "VARCHAR",
        "station_information_last_updated": "VARCHAR",
        "missing_station_information": "BOOLEAN",
    }
    # Format columns correctly for the duckdb query.
    # Replace double-quotes with single-quotes.
    columns = json.dumps(columns, indent=2).replace('"', "'")
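    # Note (not in the original gist): after the quote replacement, `columns`
    # renders as a DuckDB struct literal, e.g.
    # {'station_id': 'VARCHAR', 'num_bikes_available': 'INTEGER', ...}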
query = f"""
COPY (
SELECT *
FROM read_csv(
'{infile}',
columns={columns},
sample_size={sample_size},
header=True,
auto_detect=False
)
)
TO '{outfile}'
(FORMAT 'PARQUET');
"""
conn.execute(query)
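
# Not in the original gist: a small helper to sanity-check a converted file by
# counting rows directly from the parquet output with DuckDB. File names below
# are hypothetical.
def _count_parquet_rows(parquet_file: str) -> int:
    conn = duckdb.connect(":memory:")
    return conn.execute(f"SELECT COUNT(*) FROM '{parquet_file}'").fetchone()[0]

# Example: _count_parquet_rows("parquet/station_status_1.parquet")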

def construct_dataset(root: Path, kaggle_dataset_name: str) -> None:
    # Download the dataset from Kaggle
    logger.info("Downloading {!r} dataset from kaggle.", kaggle_dataset_name)
    download_path = root / "csv"
    download_path.mkdir(parents=True, exist_ok=True)
    kaggle.api.dataset_download_files(kaggle_dataset_name, download_path, unzip=True)

    # Convert the dataset to parquet
    logger.info("Converting dataset from CSV to parquet.")
    parquet_path = root / "parquet"
    parquet_path.mkdir(parents=True, exist_ok=True)
    csvs = list(download_path.iterdir())
    for csv in tqdm(csvs):
        outfile = parquet_path / csv.with_suffix(".parquet").name
        csv_to_parquet(csv.as_posix(), outfile.as_posix())
# Generate the dataset. Path() does not expand "~" on its own, so expanduser()
# is needed for the data to land under the home directory.
construct_dataset(
    Path("~/Desktop/data_kaggle").expanduser(), "rosenthal/citi-bike-stations"
)
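
# Not part of the original gist: a minimal sketch of loading the converted files
# with Dask (the benchmarking target the gist description mentions). Assumes
# `dask[dataframe]` is installed and the same root path as above.
import dask.dataframe as dd

parquet_glob = (Path("~/Desktop/data_kaggle").expanduser() / "parquet" / "*.parquet").as_posix()
ddf = dd.read_parquet(parquet_glob)
print(ddf.head())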