Skip to content

Instantly share code, notes, and snippets.

@Proteusiq
Created March 27, 2021 06:21
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Proteusiq/21d0b45974be24dde9ba5c4dfc8b2e70 to your computer and use it in GitHub Desktop.
Save Proteusiq/21d0b45974be24dde9ba5c4dfc8b2e70 to your computer and use it in GitHub Desktop.
from pathlib import Path
import pandas as pd
# Root folder that is scanned (recursively) for CSV files to compress.
DATA_FOLDER = Path("data")

# Replace every CSV under DATA_FOLDER with a bz2-compressed copy.
# The matches are materialized up front because the loop body creates and
# deletes files inside the very tree that is being scanned.
for csv_file in list(DATA_FOLDER.rglob("*.csv")):
    csv_full_path = csv_file.resolve()

    # Read the CSV into a DataFrame and write it back out compressed, next to
    # the original, as "<original name>.csv.bz2" (row index is not written).
    dataf = pd.read_csv(csv_full_path)
    dataf.to_csv(f"{csv_full_path}.bz2", compression="bz2", index=False)

    # Delete the uncompressed CSV only after the compressed copy was written.
    # Path.unlink() is the pathlib call that removes a file.
    csv_file.unlink()
# Note ##########################################
# read with pandas: #
# dataf = pd.read_csv( #
# 'file.csv.bz2', compression='bz2') #
# compression: #
# 'bz2' > 'xz' > 'gzip' #
# when dealing with float-type data #
# only #
# 'xz' > 'bz2'> 'gzip' #
# when dealing with mixed data types. #
# 'xz' takes longer writing and reading. #
# #
#################################################
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment