Skip to content

Instantly share code, notes, and snippets.

@matse004
Last active December 15, 2017 10:25
Show Gist options
  • Save matse004/f9b06818819dd05cb4fe0898708a13cb to your computer and use it in GitHub Desktop.
Save matse004/f9b06818819dd05cb4fe0898708a13cb to your computer and use it in GitHub Desktop.
Convert pandas DataFrames and Series into Python dictionaries so that they can be saved in MongoDB
def convert_to_dict(source):
    """Recursively convert pandas/NumPy objects into plain Python data.

    The result only contains builtin containers and scalars so that it can be
    handed to a document store such as MongoDB.  The function expects
    ``numpy`` and ``pandas`` to be available under the module-level aliases
    ``np`` and ``pd``.

    Conversion rules:

    * ``dict``: keys and values are converted recursively.
    * ``pd.DataFrame``: returns ``{column_name: [values, ...]}``.  Column
      names that are not strings are converted with ``str``.  The (first
      level of the) index is not part of the result.
    * ``pd.Series``: returns ``{index_label: value}``.  A Series with a
      MultiIndex of two or more levels is expanded into a DataFrame first and
      therefore yields the DataFrame layout.
    * For both pandas types, every MultiIndex level except the first is moved
      into columns, datetime values (index and data) are formatted with
      ``strftime('%x')`` (locale-dependent date string), numeric data is cast
      to ``float`` where possible, and missing values (NaN/NaT/None) become
      ``None``.
    * ``pd.Timestamp``: returns a ``datetime.datetime``.
    * NumPy integer, floating and boolean scalars: returns the matching
      Python scalar.
    * ``np.ndarray``: returns a (nested) list.
    * Anything else is returned unchanged.

    The input object is never modified.

    Args:
        source: The object to convert.

    Returns:
        The converted object as described above.
    """
    if isinstance(source, dict):
        return {
            convert_to_dict(key): convert_to_dict(value)
            for key, value in source.items()
        }
    if isinstance(source, (pd.DataFrame, pd.Series)):
        return _pandas_to_dict(source)
    if isinstance(source, pd.Timestamp):
        return source.to_pydatetime()
    if isinstance(source, (np.integer, np.floating, np.bool_)):
        # ``np.asscalar`` no longer exists in current NumPy; ``item()`` is the
        # supported way to obtain the equivalent Python scalar.
        return source.item()
    if isinstance(source, np.ndarray):
        return source.tolist()
    return source


def _pandas_to_dict(source):
    """Convert a DataFrame or Series into a dict of plain Python values."""
    is_datetime = pd.api.types.is_datetime64_any_dtype

    # Work on a copy: the index and column assignments below must not leak
    # into the object owned by the caller.
    data = source.copy()

    # Move every MultiIndex level except the first into regular columns.
    # All levels are reset in a single call because resetting them one by one
    # shifts the positions of the remaining levels.
    if isinstance(data.index, pd.MultiIndex):
        inner_levels = list(range(1, data.index.nlevels))
        if inner_levels:
            data = data.reset_index(level=inner_levels)

    # Datetime index labels become locale-formatted date strings.  Only real
    # datetime dtypes are matched; timedeltas have no ``strftime``.
    if is_datetime(data.index.dtype):
        data.index = data.index.strftime('%x')

    if isinstance(data, pd.Series):
        if is_datetime(data.dtype):
            data = data.dt.strftime('%x')
    else:
        for label in data.columns:
            if is_datetime(data[label].dtype):
                data[label] = data[label].dt.strftime('%x')
        # Document stores require string field names.
        data.columns = [
            label if isinstance(label, str) else str(label)
            for label in data.columns
        ]

    # Use the basic float type for all numeric data; data that cannot be
    # cast (e.g. text) is left as it is.
    data = data.astype(dtype=float, errors='ignore')

    # Missing values are replaced after the export: doing it before the float
    # cast would turn every None straight back into NaN.
    if isinstance(data, pd.DataFrame):
        return {
            label: [_to_native(value) for value in values]
            for label, values in data.to_dict(orient='list').items()
        }
    return {label: _to_native(value) for label, value in data.to_dict().items()}


def _to_native(value):
    """Map one cell to plain Python: missing -> None, NumPy scalar -> builtin."""
    if value is None:
        return None
    # ``pd.isna`` returns an array for list-likes, so only test real scalars.
    if pd.api.types.is_scalar(value) and pd.isna(value):
        return None
    if isinstance(value, (np.integer, np.floating, np.bool_)):
        return value.item()
    return value
@matse004
Copy link
Author

I couldn't find a nice native solution:
odo doesn't handle floats correctly and also has issues with its networkx dependency.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment