# stuntgoat/pandas_notes.py

## pandas_notes.py
# Load a CSV file into a DataFrame.
df = pd.read_csv("<filename>")

# Flag duplicate rows; returns a boolean Series that is True for each row
# whose ("colname1", "colname2") combination already appeared earlier.
# The call is made on `df` (the previous `r` was never defined) and uses the
# `subset` keyword, which replaced the old `cols` spelling.
dup_indexes = df.duplicated(subset=["colname1", "colname2"])

# Remove duplicate rows in place, comparing only the two named columns.
# Both columns must be passed together as `subset`; a second positional
# argument is not treated as another column name.
df.drop_duplicates(subset=["colname1", "colname2"], inplace=True)

# Drop a column.
del df['my_col_name']

# Rewrite every value in the NAMES column; `f` is a function that takes a
# single value and returns its (possibly renamed) replacement.
df['NAMES'] = df['NAMES'].map(f)

# Count how often each distinct value occurs in the NAMES column.
df['NAMES'].value_counts()

# Subset with map: keep the rows where `bool_func` is truthy for NAMES.
names_mask = df['NAMES'].map(bool_func)
df_subset = df[names_mask]

# Select rows by a single condition.
df_subset = df[df['VALUES'] > 100]
# Several conditions: build each boolean mask, then combine them with `&`.
values_mask = df['VALUES'] > 100
other_mask = df['OTHER'] == True
df_subset = df[values_mask & other_mask]

# Remove rows with a function that checks row values;
# `has_good_value` takes a row and an index of the row to check (`row_idx`)
# and returns a bool.
# `apply(..., axis=1)` only produces one result per row, so that result is
# used as a boolean mask to actually keep the rows that pass the check.
df_new = df[df.apply(has_good_value, axis=1, row_idx=0)]

# Convert a datetime64 index to a column of ISO 8601 strings.
# (`Index.format` is deprecated in recent pandas, so the strings are built
# directly from the index values.)
df['isodatetime'] = [stamp.isoformat() for stamp in df.index]

# Sort a frame by its time index.
# `ndarray.sort()` sorts in place and returns None, so its result cannot be
# handed to `reindex`; `sort_index()` returns the sorted copy directly.
df = df.sort_index()

# Drop a row by position (looked up as an index label); in place.
df.drop(df.index[3109], inplace=True)

# Cast a column to another data type (here: float).
as_float = df['my_colname'].astype(float)
df['my_colname'] = as_float

# Plot a histogram of a column using bins that are 5 wide.
bin_edges = range(0, 100, 5)
df['my_colname'].hist(bins=bin_edges)
plt.show()
# Afterwards, clear the current figure.
plt.clf()

# Append a row to a DataFrame: start from an empty frame with ten columns
# (labelled 0-9), then assign ten random values to the new row label 0.
row_values = np.random.randn(10)
df = pd.DataFrame(columns=range(10))
df.loc[0] = row_values

# Convert ISO string formatted times to datetimes.
df.times = df.times.map(pd.to_datetime)
# Create a DatetimeIndex from the datetimes.
# `pd.TimeSeries` no longer exists in pandas, so the index is built
# explicitly from the column.
df.index = pd.DatetimeIndex(df.times)
# If the times are not sorted, call sort_index -- it returns a copy.
df = df.sort_index()

# Extract the weekday name from the time index.
df['weekday'] = df.index.map(lambda x: x.strftime('%A'))

# Resample the time-indexed data by hour, summing the rows in each interval.
# The `how=` keyword was removed; the aggregation is now called on the
# resampler object. (pandas 2.2+ prefers the lowercase 'h' alias.)
df = df.resample('H').sum()

# Convert a UTC time index without timezone info to US/Eastern.
df.index = df.index.tz_localize('UTC')  # mark the naive index as UTC
df.index = df.index.tz_convert('US/Eastern')  # convert to the target zone

# Edit multiple values in place with boolean vector indexing.
# In this case, NaN values in SOME_NUMBERS are set to 0.
# `.ix` was removed from pandas; `.loc` accepts the boolean mask, and naming
# the column keeps the assignment from zeroing every other column of the
# matching rows.
df.loc[df.SOME_NUMBERS.isnull(), 'SOME_NUMBERS'] = 0

# Terminal output options: raise the limits pandas applies when printing.
display_settings = {
    'display.max_rows': 1000,
    'display.max_columns': 400,
    'display.width': 1000,  # pandas' default width is 80
}
for option_name, option_value in display_settings.items():
    pd.set_option(option_name, option_value)

# Print the description of every option matching `display`.
pd.describe_option('display')
# Turn off wrapping of wide frames across multiple lines.
pd.set_option('expand_frame_repr', False)

# numpy setting (loosely related): element count above which arrays are
# summarised instead of printed in full.
np.set_printoptions(threshold=5000)
	# Load a CSV file into a DataFrame.
	df = pd.read_csv("<filename>")

	# Flag duplicate rows; returns a boolean Series that is True for each row
	# whose ("colname1", "colname2") combination already appeared earlier.
	# The call is made on `df` (the previous `r` was never defined) and uses
	# the `subset` keyword, which replaced the old `cols` spelling.
	dup_indexes = df.duplicated(subset=["colname1", "colname2"])

	# Remove duplicate rows in place, comparing only the two named columns.
	# Both columns must be passed together as `subset`; a second positional
	# argument is not treated as another column name.
	df.drop_duplicates(subset=["colname1", "colname2"], inplace=True)

	# Drop a column.
	del df['my_col_name']

	# Rewrite every value in the NAMES column; `f` is a function that takes a
	# single value and returns its (possibly renamed) replacement.
	df['NAMES'] = df['NAMES'].map(f)

	# Count how often each distinct value occurs in the NAMES column.
	df['NAMES'].value_counts()

	# Subset with map: keep the rows where `bool_func` is truthy for NAMES.
	names_mask = df['NAMES'].map(bool_func)
	df_subset = df[names_mask]

	# Select rows by a single condition.
	df_subset = df[df['VALUES'] > 100]
	# Several conditions: build each boolean mask, then combine them with `&`.
	values_mask = df['VALUES'] > 100
	other_mask = df['OTHER'] == True
	df_subset = df[values_mask & other_mask]

	# Remove rows with a function that checks row values;
	# `has_good_value` takes a row and an index of the row to check (`row_idx`)
	# and returns a bool.
	# `apply(..., axis=1)` only produces one result per row, so that result is
	# used as a boolean mask to actually keep the rows that pass the check.
	df_new = df[df.apply(has_good_value, axis=1, row_idx=0)]

	# Convert a datetime64 index to a column of ISO 8601 strings.
	# (`Index.format` is deprecated in recent pandas, so the strings are built
	# directly from the index values.)
	df['isodatetime'] = [stamp.isoformat() for stamp in df.index]

	# Sort a frame by its time index.
	# `ndarray.sort()` sorts in place and returns None, so its result cannot be
	# handed to `reindex`; `sort_index()` returns the sorted copy directly.
	df = df.sort_index()

	# Drop a row by position (looked up as an index label); in place.
	df.drop(df.index[3109], inplace=True)

	# Cast a column to another data type (here: float).
	as_float = df['my_colname'].astype(float)
	df['my_colname'] = as_float

	# Plot a histogram of a column using bins that are 5 wide.
	bin_edges = range(0, 100, 5)
	df['my_colname'].hist(bins=bin_edges)
	plt.show()
	# Afterwards, clear the current figure.
	plt.clf()

	# Append a row to a DataFrame: start from an empty frame with ten columns
	# (labelled 0-9), then assign ten random values to the new row label 0.
	row_values = np.random.randn(10)
	df = pd.DataFrame(columns=range(10))
	df.loc[0] = row_values

	# Convert ISO string formatted times to datetimes.
	df.times = df.times.map(pd.to_datetime)
	# Create a DatetimeIndex from the datetimes.
	# `pd.TimeSeries` no longer exists in pandas, so the index is built
	# explicitly from the column.
	df.index = pd.DatetimeIndex(df.times)
	# If the times are not sorted, call sort_index -- it returns a copy.
	df = df.sort_index()

	# Extract the weekday name from the time index.
	df['weekday'] = df.index.map(lambda x: x.strftime('%A'))

	# Resample the time-indexed data by hour, summing the rows in each interval.
	# The `how=` keyword was removed; the aggregation is now called on the
	# resampler object. (pandas 2.2+ prefers the lowercase 'h' alias.)
	df = df.resample('H').sum()

	# Convert a UTC time index without timezone info to US/Eastern.
	df.index = df.index.tz_localize('UTC') # mark the naive index as UTC
	df.index = df.index.tz_convert('US/Eastern') # convert to the target zone

	# Edit multiple values in place with boolean vector indexing.
	# In this case, NaN values in SOME_NUMBERS are set to 0.
	# `.ix` was removed from pandas; `.loc` accepts the boolean mask, and naming
	# the column keeps the assignment from zeroing every other column of the
	# matching rows.
	df.loc[df.SOME_NUMBERS.isnull(), 'SOME_NUMBERS'] = 0

	# Terminal output options: raise the limits pandas applies when printing.
	display_settings = {
		'display.max_rows': 1000,
		'display.max_columns': 400,
		'display.width': 1000,  # pandas' default width is 80
	}
	for option_name, option_value in display_settings.items():
		pd.set_option(option_name, option_value)

	# Print the description of every option matching `display`.
	pd.describe_option('display')
	# Turn off wrapping of wide frames across multiple lines.
	pd.set_option('expand_frame_repr', False)

	# numpy setting (loosely related): element count above which arrays are
	# summarised instead of printed in full.
	np.set_printoptions(threshold=5000)