Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
import dask.dataframe as dd
# Column-name groups. `columns` is the full selection, ordered as:
# 'click', the numeric columns, the one-hot group, then the ordinal group.
ordinal_columns = [
    'category_0',
    'category_1',
    'category_2',
    'category_3',
    'category_4',
    'category_6',
    'category_7',
    'category_9',
    'category_10',
    'category_11',
    'category_13',
    'category_14',
    'category_17',
    'category_19',
    'category_20',
    'category_21',
    'category_22',
    'category_23',
]

onehot_columns = [
    'category_5',
    'category_8',
    'category_12',
    'category_15',
    'category_16',
    'category_18',
    'category_24',
    'category_25',
]

# numeric_0 .. numeric_12
numeric_columns = [f'numeric_{i}' for i in range(13)]

columns = ['click', *numeric_columns, *onehot_columns, *ordinal_columns]
def main():
    """Write a sampled copy of the split parquet data.

    Reads the module-level ``columns`` from ``data/split/*.parquet`` using
    the fastparquet engine, categorizes ``onehot_columns``, takes a
    ``frac=0.10`` sample, repartitions it into 1000 partitions and writes
    the result to ``data/sample-10.parquet`` with SNAPPY compression.
    """
    categories = ['category_%d' % i for i in range(26)]

    # Every category_* name gets the same writer settings: 'bytes' as its
    # object encoding and 8 as its fixed_text value.
    writer_options = dict(
        object_encoding=dict.fromkeys(categories, 'bytes'),
        engine="fastparquet",
        fixed_text=dict.fromkeys(categories, 8),
        compression="SNAPPY",
    )

    source = dd.read_parquet(
        "data/split/*.parquet",
        engine="fastparquet",
        columns=columns,
    )
    categorized = source.categorize(columns=onehot_columns)
    sampled = categorized.sample(frac=0.10)
    sample = sampled.repartition(npartitions=1000)

    sample.to_parquet("data/sample-10.parquet", **writer_options)


if __name__ == "__main__":
    main()
from pathlib import Path
import pandas as pd
import dask.dataframe as dd
# Two groups of category_* column names, held as pandas Index objects.
ordinal_columns = pd.Index([
    'category_0', 'category_1', 'category_2', 'category_3',
    'category_4', 'category_6', 'category_7', 'category_9',
    'category_10', 'category_11', 'category_13', 'category_14',
    'category_17', 'category_19', 'category_20', 'category_21',
    'category_22', 'category_23',
])

onehot_columns = pd.Index([
    'category_5', 'category_8', 'category_12',
    'category_15', 'category_16', 'category_18',
    'category_24', 'category_25',
])
def main():
    """Split ``data/day_0`` into numbered parquet files under ``data/split``.

    The tab-separated input has no header row; columns are named 'click',
    numeric_0..numeric_12 and category_0..category_25. It is read in chunks
    of 100000 rows, with the module-level ``onehot_columns`` parsed as
    'category' dtype, and each chunk is written to
    ``data/split/<zero-padded chunk number>.parquet`` using fastparquet
    with SNAPPY compression.
    """
    categories = ['category_%d' % i for i in range(26)]
    numerics = ['numeric_%d' % i for i in range(13)]
    columns = ['click', *numerics, *categories]

    # Every category_* name gets the same writer settings: 'bytes' as its
    # object encoding and 8 as its fixed_text value.
    writer_options = dict(
        object_encoding=dict.fromkeys(categories, 'bytes'),
        engine='fastparquet',
        fixed_text=dict.fromkeys(categories, 8),
        compression='SNAPPY',
    )

    chunker = pd.read_csv(
        'data/day_0',
        sep='\t',
        names=columns,
        header=None,
        chunksize=100000,
        dtype=dict.fromkeys(onehot_columns, 'category'),
    )

    # The output directory is created only after the reader has been set up.
    Path('data/split').mkdir(exist_ok=True)

    for i, df in enumerate(chunker):
        print(f"Writing, {i:0>6}")
        df.to_parquet(f'data/split/{i:0>6}.parquet', **writer_options)


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment