Skip to content

Instantly share code, notes, and snippets.

@LouisdeBruijn
Last active June 14, 2021 16:55
Show Gist options
  • Save LouisdeBruijn/e75f0d448cfce8cc0c8b2f9f04578202 to your computer and use it in GitHub Desktop.
Save LouisdeBruijn/e75f0d448cfce8cc0c8b2f9f04578202 to your computer and use it in GitHub Desktop.
from pathlib import Path
from typing import Dict, List, Optional, Sequence, Tuple, Union

import pandas as pd
class DataProcessor:
    """Read and process data."""

    @classmethod
    def train_test_sets(
        cls,
        train_paths: Sequence[Union[Path, str]],
        test_paths: Optional[Sequence[Union[Path, str]]] = None,
        duplicate_threshold: int = 3,
    ) -> Tuple[pd.DataFrame, Optional[pd.DataFrame]]:
        """Build the train DataFrame and, when requested, the test DataFrame.

        Both sets are produced by passing the given paths to
        ``concatenate_datasets``, looked up on ``cls`` so that subclasses
        can override how the datasets are read.

        NOTE(review): no train/test leakage filtering is performed in this
        method; ``duplicate_threshold`` is accepted but currently unused.

        Args:
            train_paths: Paths passed to ``concatenate_datasets`` to build
                the train set.
            test_paths: Paths passed to ``concatenate_datasets`` to build
                the test set. ``None`` or an empty sequence skips it.
            duplicate_threshold: Not used by this method; kept so existing
                callers that pass it keep working.

        Returns:
            A ``(train_df, test_df)`` tuple. ``test_df`` is ``None`` when no
            test paths were supplied.
        """
        train_df: pd.DataFrame = cls.concatenate_datasets(train_paths)

        # Truthiness check on purpose: an empty sequence is treated the same
        # as "no test set requested".
        test_df: Optional[pd.DataFrame] = None
        if test_paths:
            test_df = cls.concatenate_datasets(test_paths)

        return train_df, test_df
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment