Skip to content

Instantly share code, notes, and snippets.

@prabhant
Created June 7, 2022 15:22
Show Gist options
  • Save prabhant/dfd25b894afbf4d102f7abee23376c41 to your computer and use it in GitHub Desktop.
Save prabhant/dfd25b894afbf4d102f7abee23376c41 to your computer and use it in GitHub Desktop.
translating sparse arff files to sparse parquet files for OpenML
# Convert an entire OpenML dataset to a pyarrow table and store it as Parquet
import numpy as np
import openml
import pyarrow as pa
import pyarrow.parquet as pq
# Fetch the dataset from OpenML; qualities are not downloaded.
did = 39947
d = openml.datasets.get_dataset(did, download_qualities=False)
df, *_ = d.get_data(
    dataset_format="dataframe",
    include_row_id=True,
    include_ignore_attribute=True,
)
# Select the columns in the order of the features listed in the dataset metadata.
df = df[[f.name for f in d.features.values()]]


def _column_to_arrow(column):
    """Convert one DataFrame column into a pyarrow array.

    Entries equal to the column dtype's ``fill_value`` are masked and
    therefore stored as nulls in the resulting array.  Columns whose dtype
    has no ``fill_value`` attribute are converted without an explicit mask.

    Args:
        column: A pandas Series (one column of the dataset frame).

    Returns:
        A ``pyarrow.Array`` holding the column's values.
    """
    values = np.asarray(column)
    dtype = column.dtype
    if hasattr(dtype, "fill_value"):
        # Vectorised null mask.  A NaN fill value never compares equal, but
        # ``from_pandas=True`` below already treats NaN entries as nulls.
        mask = np.asarray(values == dtype.fill_value, dtype=bool)
    else:
        mask = None
    return pa.array(values, from_pandas=True, mask=mask)


# Convert every column to a pyarrow array and assemble the table.
names = list(df.columns)
parr_array = [_column_to_arrow(column) for _, column in df.items()]
table = pa.table(parr_array, names=names)
pq.write_table(table, 'df.parquet')

# Read the file back with pandas to check the round trip.
import pandas as pd
df_new = pd.read_parquet('df.parquet')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment