# Sample 10% of the split Parquet files into a single, smaller dataset.
import dask.dataframe as dd

# Categorical columns intended for ordinal encoding downstream.
ordinal_columns = [
    'category_0', 'category_1', 'category_2', 'category_3',
    'category_4', 'category_6', 'category_7', 'category_9',
    'category_10', 'category_11', 'category_13', 'category_14',
    'category_17', 'category_19', 'category_20', 'category_21',
    'category_22', 'category_23',
]
# Categorical columns intended for one-hot encoding downstream.
onehot_columns = [
    'category_5', 'category_8', 'category_12',
    'category_15', 'category_16', 'category_18',
    'category_24', 'category_25',
]
numeric_columns = [f'numeric_{i}' for i in range(13)]
columns = ['click'] + numeric_columns + onehot_columns + ordinal_columns


def main():
    # Read every split file and turn the one-hot columns into
    # categoricals with known categories.
    df = dd.read_parquet("data/split/*.parquet", engine="fastparquet",
                         columns=columns).categorize(columns=onehot_columns)
    # Take a 10% sample and consolidate it into 1,000 partitions.
    sample = df.sample(frac=0.10).repartition(npartitions=1000)

    # fastparquet options: write the categorical columns as fixed-width
    # (8-byte) byte strings with snappy compression.
    categories = ['category_%d' % i for i in range(26)]
    encoding = {c: 'bytes' for c in categories}
    fixed = {c: 8 for c in categories}
    sample.to_parquet("data/sample-10.parquet", object_encoding=encoding,
                      engine="fastparquet", fixed_text=fixed,
                      compression="SNAPPY")


if __name__ == "__main__":
    main()
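
# NOTE: this block is not part of the original gist. It is a minimal sketch
# of one way the 10% sample written above could be consumed. The path,
# engine, and the `onehot_columns` / `ordinal_columns` lists come from the
# script above; the encoding choices below are illustrative only.
import dask.dataframe as dd

sample = dd.read_parquet("data/sample-10.parquet", engine="fastparquet")
# Ensure every categorical column has known categories before encoding.
sample = sample.categorize(columns=onehot_columns + ordinal_columns)

# One-hot (dummy) encode the columns earmarked for it ...
dummies = dd.get_dummies(sample[onehot_columns])

# ... and label-encode the ordinal columns via their categorical codes.
ordinals = sample[ordinal_columns]
for col in ordinal_columns:
    ordinals[col] = ordinals[col].cat.codes

print(dummies.head())
print(ordinals.head())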
# Split the raw, tab-separated `data/day_0` log into many small Parquet
# files, one per 100,000-row chunk.
from pathlib import Path

import pandas as pd

# Categorical columns intended for ordinal encoding downstream.
ordinal_columns = pd.Index([
    'category_0',
    'category_1',
    'category_2',
    'category_3',
    'category_4',
    'category_6',
    'category_7',
    'category_9',
    'category_10',
    'category_11',
    'category_13',
    'category_14',
    'category_17',
    'category_19',
    'category_20',
    'category_21',
    'category_22',
    'category_23',
])
# Categorical columns intended for one-hot encoding downstream.
onehot_columns = pd.Index([
    'category_5',
    'category_8',
    'category_12',
    'category_15',
    'category_16',
    'category_18',
    'category_24',
    'category_25',
])


def main():
    categories = ['category_%d' % i for i in range(26)]
    columns = ['click'] + ['numeric_%d' % i for i in range(13)] + categories
    # fastparquet options: write the categorical columns as fixed-width
    # (8-byte) byte strings.
    encoding = {c: 'bytes' for c in categories}
    fixed = {c: 8 for c in categories}

    # The raw log is tab-separated with no header row; read it in
    # 100,000-row chunks so it never has to fit in memory at once.
    chunker = pd.read_csv('data/day_0', sep='\t',
                          names=columns, header=None,
                          chunksize=100000,
                          dtype={col: 'category' for col in onehot_columns})

    Path('data/split').mkdir(exist_ok=True)
    for i, df in enumerate(chunker):
        print(f"Writing chunk {i:0>6}")
        df.to_parquet(f'data/split/{i:0>6}.parquet',
                      object_encoding=encoding,
                      engine='fastparquet',
                      fixed_text=fixed,
                      compression='SNAPPY')


if __name__ == '__main__':
    main()
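
# NOTE: this block is not part of the original gist. A quick, illustrative
# sanity check on the split output: the directory and file-name pattern come
# from the script above; the expectations in the comments are assumptions.
import pandas as pd

chunk = pd.read_parquet('data/split/000000.parquet', engine='fastparquet')
print(chunk.shape)          # first chunk: up to 100,000 rows x 40 columns
print(chunk.dtypes.head())  # 'click' plus numeric_* and category_* columns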