Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Example with Dask
import numpy as np
import pandas as pd
from dask import dataframe as dd
from dask.multiprocessing import get
from multiprocessing import cpu_count
# CPU count of the current machine; passed as `npartitions` to dd.from_pandas below.
num_cores = cpu_count()
from datetime import date
def extract_procedure_features(physician_procedures):
    """Build per-physician, patient-weighted procedure-code features.

    Each row's ``procedure_code`` is one-hot encoded, the indicator columns
    are scaled by that row's ``number_of_patients``, and the scaled rows are
    summed per ``physician_id``.  The scaling step runs partition-wise
    through Dask.

    Args:
        physician_procedures: pandas DataFrame providing the columns
            ``procedure_code``, ``number_of_patients`` and ``physician_id``
            (the only columns this function reads).

    Returns:
        pandas DataFrame with a ``physician_id`` column followed by one
        ``procedure_<code>`` column per distinct procedure code, one row per
        physician.

    Side effects:
        Writes the result as CSV to ``processed_data_path``.  That name is
        not defined in this function; it must exist at module level
        (NOTE(review): confirm where it is set) or a NameError is raised
        after the computation has finished.
    """
    one_hot_procedures = pd.get_dummies(
        physician_procedures.procedure_code, prefix='procedure'
    )
    dummied_procedures = pd.concat(
        [physician_procedures.number_of_patients, one_hot_procedures], axis=1
    )

    def numerize(partition):
        # map_partitions hands over a whole partition (a pandas DataFrame),
        # not a single row.  Multiply every column by the row's patient
        # count along the index (axis=0) and keep the DataFrame labels so
        # that 'number_of_patients' can still be dropped by name afterwards.
        return partition.mul(partition.number_of_patients, axis=0)

    numerized_procedures = (
        dd.from_pandas(dummied_procedures, npartitions=num_cores)
        .map_partitions(numerize)
        # `get=` was deprecated by Dask in favour of `scheduler=`;
        # 'processes' selects the multiprocessing scheduler.
        .compute(scheduler='processes')
        # This column now holds number_of_patients squared; it was only
        # carried along as the per-row weight.
        .drop('number_of_patients', axis=1)
    )

    # assign() aligns physician_id on the index, which the Dask round trip
    # preserves from the input frame.
    combined_numerized_procedures = (
        numerized_procedures
        .assign(physician_id=physician_procedures.physician_id)
        .groupby('physician_id')
        .sum()
        .reset_index()
    )

    combined_numerized_procedures.to_csv(processed_data_path)
    return combined_numerized_procedures
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment