Skip to content

Instantly share code, notes, and snippets.

@stuntgoat
Last active March 8, 2016 01:45
Show Gist options
  • Save stuntgoat/7150564 to your computer and use it in GitHub Desktop.
Save stuntgoat/7150564 to your computer and use it in GitHub Desktop.
notes on using Pandas
# Read a CSV file into a DataFrame.
df = pd.read_csv("<filename>")
# Flag duplicate rows, comparing only the listed columns; returns a boolean
# Series. The keyword is `subset` (the old `cols` spelling is not accepted).
dup_indexes = df.duplicated(subset=["colname1", "colname2"])
# Remove duplicate rows in place. The columns to compare are passed together
# as one list; a second positional argument is not treated as a column name.
df.drop_duplicates(subset=["colname1", "colname2"], inplace=True)
# Drop a column.
del df['my_col_name']
# Rewrite every value of a column; `f` is a function that takes one value
# and returns its replacement based on some condition.
df['NAMES'] = df['NAMES'].map(f)
# Count how often each distinct value occurs in a column.
df['NAMES'].value_counts()
# Subset with map: `bool_func` is applied to each value and the resulting
# boolean Series is used as the row mask.
names_mask = df['NAMES'].map(bool_func)
df_subset = df[names_mask]
# Select rows by a single condition.
df_subset = df[df['VALUES'] > 100]
# Combine several conditions element-wise with `&`.
is_large = df['VALUES'] > 100
is_flagged = df['OTHER'] == True
df_subset = df[is_large & is_flagged]
# Keep only the rows that pass a per-row check.
# `has_good_value` takes a row and the position of the value to check
# (`row_idx`) and returns a bool. `apply(..., axis=1)` by itself only yields
# the boolean result per row, so that result is used as a mask to actually
# filter the frame.
good_rows = df.apply(has_good_value, axis=1, row_idx=0)
df_new = df[good_rows]
# Add a column holding the datetime64 index as ISO 8601 strings.
# `Index.map` is used because `Index.format` is deprecated in recent pandas.
df['isodatetime'] = df.index.map(lambda x: x.isoformat())
# Sort a frame by its time index.
# `ndarray.sort()` sorts in place and returns None, so feeding its result to
# `reindex` never reordered anything (and the result was discarded anyway).
# `sort_index` returns the sorted copy, which is bound back to `df`.
df = df.sort_index()
# Drop the row whose index label sits at position 3109; modifies `df` in place.
df.drop(index=df.index[3109], inplace=True)
# Cast a column to float and store the converted values back on the frame.
as_float = df['my_colname'].astype(float)
df['my_colname'] = as_float
# Plot a histogram of one column; the bin edges are spaced 5 apart.
bin_edges = range(0, 100, 5)
df['my_colname'].hist(bins=bin_edges)
plt.show()
# Afterwards, clear the current figure.
plt.clf()
# Add a row to a dataframe by assigning to a label that does not exist yet.
# The frame starts with ten columns and no rows; row 0 is then filled with
# one standard-normal sample per column.
df = pd.DataFrame(columns=range(10))
df.loc[0] = np.random.standard_normal(size=10)
# Convert ISO string formatted times to datetimes, one value at a time.
df['times'] = df['times'].map(pd.to_datetime)
# Create a DatetimeIndex from the datetimes. (`pd.TimeSeries` has been
# removed from pandas; `pd.DatetimeIndex` builds the index directly.)
df.index = pd.DatetimeIndex(df['times'])
# If the times are not sorted, call sort_index; it returns a sorted copy.
df = df.sort_index()
# Derive the weekday name ('%A') of every timestamp in the index and store
# the names as a new column.
weekday_names = df.index.map(lambda stamp: stamp.strftime('%A'))
df['weekday'] = weekday_names
# Resample the time-indexed data into hourly bins and sum the rows that fall
# into each bin. The aggregation is a method call on the resampler; the old
# `how=` keyword has been removed. (pandas older than 2.2 spells the hourly
# alias 'H' instead of 'h'.)
df = df.resample('h').sum()
# Convert a timezone-naive index holding UTC times to US/Eastern:
# first label the naive timestamps as UTC, then convert them.
df.index = df.index.tz_localize('UTC').tz_convert('US/Eastern')
# Edit multiple values in place with boolean vector indexing.
# In this case the NaN entries of SOME_NUMBERS are set to 0. `.loc` replaces
# the removed `.ix` indexer, and naming the column limits the assignment to
# that column instead of zeroing every column of the matching rows.
df.loc[df.SOME_NUMBERS.isnull(), 'SOME_NUMBERS'] = 0
# Terminal output options: how many rows and columns pandas prints, and how
# wide the rendered output may be.
for _option_name, _option_value in (
    ('display.max_rows', 1000),
    ('display.max_columns', 400),
    ('display.width', 1000),
):
    pd.set_option(_option_name, _option_value)
# Print the description and current value of every display option.
pd.describe_option('display')
# Turn off wrapping of wide frames across several lines.
pd.set_option('expand_frame_repr', False)
# numpy settings; sort of related: summarize arrays only beyond 5000 elements.
np.set_printoptions(threshold=5000)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment