Skip to content

Instantly share code, notes, and snippets.

@jelford
Last active April 21, 2023 14:42
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jelford/f599a8f596d949ff356a222325d33cd2 to your computer and use it in GitHub Desktop.
Save jelford/f599a8f596d949ff356a222325d33cd2 to your computer and use it in GitHub Desktop.
Read Numerai dataset by Era
import numpy as np
import pandas as pd
import pyarrow.dataset as ds
def load(
    source_file: str,
    eras=None,
    features=None,
    *,
    features_to_load_at_a_time: int = 200,
) -> pd.DataFrame:
    """Load the requested eras and feature columns of a dataset as a DataFrame.

    Args:
        source_file: Location of the dataset. It is converted with ``str()``
            and handed to ``pyarrow.dataset.dataset``.
        eras: Values of the ``era`` column to keep. ``None`` means every era
            returned by ``eras_from_file(source_file)``.
        features: Names of the columns to load. ``None`` means every column
            returned by ``feature_names(source_file)``.
        features_to_load_at_a_time: How many feature columns are read and
            converted to pandas per pass over the dataset.

    Returns:
        A DataFrame whose index is named ``id``. Every requested column is
        cast to ``float64``, except ``era`` (if requested), which is cast to
        ``object``.

    Raises:
        ValueError: If ``features_to_load_at_a_time`` is smaller than 1.
    """
    if features_to_load_at_a_time < 1:
        raise ValueError("features_to_load_at_a_time must be at least 1")

    source_file = str(source_file)
    if eras is None:
        eras = eras_from_file(source_file)
    if features is None:
        features = feature_names(source_file)
    else:
        # Copy into a list so tuples and other sequences can be sliced and
        # concatenated with ['id'] below.
        features = list(features)

    dataset = ds.dataset(source_file, exclude_invalid_files=True)
    # The row filter is the same for every chunk, so build it once.
    era_filter = ds.field("era").isin(eras)

    # Columns are read a slice at a time rather than all at once
    # (presumably to bound the memory needed per Arrow -> pandas conversion).
    # With no features requested, still do one id-only read so that
    # pd.concat is never called on an empty list.
    feature_chunks = [
        features[start:start + features_to_load_at_a_time]
        for start in range(0, len(features), features_to_load_at_a_time)
    ] or [[]]

    frames = []
    for chunk in feature_chunks:
        frame = dataset.to_table(
            columns=chunk + ['id'], filter=era_filter
        ).to_pandas()
        # If 'id' was not restored as the index by to_pandas(), promote it
        # here; otherwise every chunk would contribute a duplicate 'id'
        # column and the frames would be aligned by row position only.
        if 'id' in frame.columns:
            frame = frame.set_index('id')
        frames.append(frame)

    df = pd.concat(frames, axis=1)

    # np.float64 is what the np.float_ alias pointed to; the alias itself
    # no longer exists in NumPy 2.0.
    dtypes = {f: np.float64 for f in features}
    if 'era' in features:
        dtypes['era'] = object
    df = df.astype(dtype=dtypes)
    df.index.name = 'id'
    return df
def eras_from_file(source_file: str) -> list[str]:
    """Return the distinct values of the ``era`` column, sorted ascending.

    Args:
        source_file: Location of the dataset, handed to
            ``pyarrow.dataset.dataset``.

    Returns:
        The unique ``era`` values as Python objects, in sorted order.
    """
    dataset = ds.dataset(source_file, exclude_invalid_files=True)
    era_column = dataset.to_table(columns=['era']).column('era')
    # Deduplicate inside Arrow so that only the distinct values are turned
    # into Python objects, instead of one scalar per row.
    return sorted(era_column.unique().to_pylist())
def feature_names(source_file: str) -> list[str]:
    """Return the sorted names of all columns whose name starts with 'feature'.

    Only the dataset schema is consulted to obtain the column names.
    """
    # Go through the dataset interface rather than the parquet one, because
    # it infers the s3 filesystem.
    schema = ds.dataset(source_file).schema
    matching = [name for name in schema.names if name.startswith('feature')]
    matching.sort()
    return matching
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment