Skip to content

Instantly share code, notes, and snippets.

@fulibacsi
Last active August 22, 2019 13:31
Show Gist options
  • Save fulibacsi/158d4d41a1fb77183d6133a169398caa to your computer and use it in GitHub Desktop.
Save fulibacsi/158d4d41a1fb77183d6133a169398caa to your computer and use it in GitHub Desktop.
Generate random pandas DataFrame
import datetime
import string

import numpy as np
import pandas as pd
from pandas.util.testing import makeIntIndex, rands_array
class RandomDataFrame:
    """Build pandas DataFrames filled with random test data.

    Columns are named ``int_<i>``, ``float_<i>``, ``str_<i>`` and
    ``date_<i>`` and appear in that order. All random draws go through the
    global ``np.random`` state, so calling ``np.random.seed(...)`` beforehand
    makes the output reproducible.

    Index and string generation use only public numpy/pandas API rather than
    the private pandas testing helpers, which newer pandas releases no
    longer ship.

    Args:
        size: Number of rows in the generated frame.
        nintcols: Number of integer columns.
        nfloatcols: Number of float columns.
        nstringcols: Number of string columns.
        ndatecols: Number of datetime columns.
        maxintvalue: Exclusive upper bound for integer columns (lower bound
            is 0).
        nchars: Length of every generated string.
    """

    # Alphabet for random strings: ASCII letters and digits, one char each.
    _CHARS = np.array(list(string.ascii_letters + string.digits))

    def __init__(self, size, nintcols=1, nfloatcols=0,
                 nstringcols=0, ndatecols=0, maxintvalue=5000, nchars=10):
        self.size = size
        self.nintcols = nintcols
        self.maxintvalue = maxintvalue
        self.nfloatcols = nfloatcols
        self.nstringcols = nstringcols
        self.nchars = nchars
        self.ndatecols = ndatecols

    def generate(self):
        """Return a new DataFrame with the configured random columns.

        The frame has an int64 index ``0 .. size-1``.
        """
        index = pd.Index(np.arange(self.size, dtype=np.int64))
        # (name template, column count, column factory), in output order.
        plan = [
            ('int_{}', self.nintcols, self.int_col),
            ('float_{}', self.nfloatcols, self.float_col),
            ('str_{}', self.nstringcols, self.str_col),
            ('date_{}', self.ndatecols, self.date_col),
        ]
        columns = {}
        for template, count, make_col in plan:
            for i in range(count):
                col_name = template.format(i)
                columns[col_name] = make_col(index, col_name)
        return pd.DataFrame(columns, index=index)

    def int_col(self, index, name=None):
        """Return a Series of random integers in ``[0, maxintvalue)``."""
        values = np.random.randint(self.maxintvalue, size=self.size)
        return pd.Series(values, index=index, name=name)

    def float_col(self, index, name=None):
        """Return a Series of random floats drawn by ``np.random.rand``."""
        return pd.Series(np.random.rand(self.size), index=index, name=name)

    def str_col(self, index, name=None):
        """Return a Series of random strings, each ``nchars`` long.

        Characters are drawn from ASCII letters and digits.
        """
        # Draw every character in one call, then reinterpret each run of
        # ``nchars`` one-character cells as a single fixed-width string.
        flat = np.random.choice(self._CHARS, size=self.size * self.nchars)
        values = flat.view((np.str_, self.nchars)).astype(object)
        return pd.Series(values, index=index, name=name)

    def date_col(self, index, name=None):
        """Return a Series of random dates between 1980 and 2019.

        Days are limited to 1..28 so every (year, month, day) combination
        is a valid calendar date.
        """
        # randint's upper bound is exclusive: 13 allows December and 29
        # allows the 28th.
        year = np.random.randint(1980, 2020, self.size)
        month = np.random.randint(1, 13, self.size)
        day = np.random.randint(1, 29, self.size)
        dt = pd.to_datetime({'year': year, 'month': month, 'day': day})
        # Pass raw values so they are placed positionally on ``index``;
        # passing the Series itself would align by label and yield NaT for
        # any index whose labels are not 0..size-1.
        return pd.Series(dt.values, index=index, name=name)
def random_df(size, nintcols=1, nfloatcols=0, nstringcols=0, ndatecols=0):
    """Create a random DataFrame in a single call.

    Builds a ``RandomDataFrame`` from the given row count and per-type
    column counts, then returns the result of its ``generate()`` method.
    """
    return RandomDataFrame(
        size,
        nintcols,
        nfloatcols,
        nstringcols,
        ndatecols,
    ).generate()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment