Created
May 25, 2022 14:23
-
-
Save jpigla/44ce25363ec9ded68243d8f7c2805dbd to your computer and use it in GitHub Desktop.
Cheat Sheet Python Pandas
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# === Pandas ================================================================================================================== | |
# === Pandas / Module + Settings ============================================================================================== | |
# Import Dataframe library | |
import pandas as pd | |
# Pandas Settings for better output in Jupyter Notebooks | |
pd.set_option('display.max_colwidth', None) | |
pd.set_option('display.max_rows', 100) | |
pd.set_option('display.max_columns', 50) | |
# === Pandas / DataFrame ====================================================================================================== | |
# Create empty DataFrame | |
df = pd.DataFrame() | |
df = pd.DataFrame(None) | |
# Read CSV file into DataFrame | |
df = pd.DataFrame(pd.read_csv('data.csv')) | |
df = pd.read_csv('data.csv') | |
skiprows = 5 # Skip first 5 rows | |
header = 2 # row number which contains the column names | |
sep = '\t' # Separator (default ',') | |
index_col = 0 # column to use as index | |
encoding = 'latin-1' # Set encoding for import | |
name = ['column'] # Rename columns (new name) | |
engine = 'python' # Engine to use for import | |
dtype = {'Name': str, 'Grade': int} # Specify data type for columns | |
usecols = [0, 1] # Columns to import | |
nrows = 10 # Number of rows to import | |
na_values = ['NA'] # Values for NA to use for import | |
mode = 'a' # Mode to use for import (a: append, overwrite, etc.) | |
# === Pandas / Styling ======================================================================================================== | |
# Sets text align to left for single DataFrame | |
df = df.style.set_properties(**{'text-align': 'left'}) | |
# === Pandas / Basic Data Information ========================================================================================= | |
# Create Statistics for DataFrame (mean, std, min, max, etc.) & inklude strings | |
df.describe(include='all') | |
include = 'all' / None # Include strings or numeric values only | |
# Get number of rows and columns | |
df.shape | |
df.shape[0] # Number of rows | |
df.shape[1] # Number of columns | |
# Get distinct values in column | |
df['column'].unique() | |
# Get count of distinct values in column | |
df['column'].value_counts() | |
# Get list of columns | |
df.columns | |
# Get information about DataFrame | |
df.info() | |
# Get count of non-NA values in DataFrame | |
df.count() | |
df.count(axis=1) # Get count of columns in DataFrame | |
df.count(axis=0) # Get count of rows in column | |
len(df) # Get count of rows in DataFrame | |
# Get number of rows with each unique value in column | |
df['column'].value_counts() | |
# === Pandas / Data Summarising =============================================================================================== | |
# Get minimum / maximum value of column | |
df['column'].min() | |
df['column'].max() | |
# Get (cumulative) mean of column | |
df['column'].mean() | |
df['column'].cumsum() / df['column'].count() | |
# Get median of column | |
df['column'].median() | |
# Get mode of column | |
df['column'].mode() | |
# Get (cumulative) sum of column | |
df['column'].sum() | |
df['column'].cumsum() | |
# Get standard deviation of column | |
df['column'].std() | |
# Get variance of column | |
df['column'].var() | |
# Get skewness of column | |
df['column'].skew() | |
# Get kurtosis of column | |
df['column'].kurt() | |
# Get quantile of column | |
df['column'].quantile(0.5) | |
# Get minimum / maximum index value of column | |
df['column'].idxmin() | |
df['column'].idxmax() | |
# === Pandas / Row Manipulation =============================================================================================== | |
# Delete duplicated Rows by selected column (inplace) | |
df.drop_duplicates(subset=['column'], keep="first", inplace=True) | |
df.drop_duplicates(subset=['column', 'column2'], keep="first", inplace=True) | |
keep = 'first' / 'last' / False # Keep first / last / no duplicates | |
# === Pandas / Column Manipulation ============================================================================================ | |
# Delete selected Columns (inplace) | |
df.drop(columns=['column'], inplace=True) | |
# === Pandas / Data Extraction ================================================================================================ | |
# Extract strings from column | |
df['column'].str.findall(r'(.*?)') | |
# === Pandas / Data Extraction / Examples ===================================================================================== | |
# Extract strings from column and concate strings with '@@ | |
df['column'].str.findall(r'(.*?)').apply('@@'.join) | |
# === Pandas / Data Sorting =================================================================================================== | |
# Sort DataFrame by column (inplace) | |
df.sort_values('column1', ascending=True, inplace=True) | |
df.sort_values(['column1', 'column2'], ascending=[True, True], inplace=True) | |
# === Pandas / Data Filtering ================================================================================================= | |
# Groups Data in column (unique) and keep top 5 values | |
df.groupby(['column']).head(5) | |
# Select columns by name with regex | |
df.filter(regex='.*string.*') | |
# Select columns by dtype (include / exclude) | |
df.select_dtypes(include='number', exclude='object') | |
# === Pandas / Applying Functions ============================================================================================= | |
# Apply function to column | |
df['column'].apply(lambda x: x + 1) | |
# Apply function element-wise to column | |
df['column'].applymap(lambda x: x + 1) | |
# === Pandas / Handling Missing Data ========================================================================================== | |
# Replace missing values with 0 | |
df['column'].fillna(0) | |
df['column'].fillna(np.nan) # Replace missing values with NaN | |
df['column'].fillna(df['column'].mean()) # Replace missing values with mean | |
df['column'].fillna({'column': 'value'}, inplace=True) # Replace missing values with column specific value | |
df['column'].fillna(method='ffill') # Replace missing values with previous value | |
df['column'].fillna(method='bfill') # Replace missing values with next value | |
method = 'ffill' / 'bfill' / 'pad' / 'backfill' / None # Method to use for filling missing values (default: 'ffill'; 'ffill' / 'pad': previous value, 'bfill' / 'backfill': next value) | |
# Drop rows (all columns) with missing values | |
df.dropna() | |
df.dropna(subset=['column']) # Drop rows (selected columns) with missing values | |
how = 'any' / 'all' # How to drop rows (any: drop rows if any column has missing values, all: drop rows if all columns have missing values) | |
# Keep rows (all columns) where rows of selected column are not NaN | |
df[df['column'].notnull()] | |
df[df['column'].notna()] | |
# === Pandas / Cleaning Data ================================================================================================== | |
# Replace old string with new string | |
df['column'].str.replace({'old string', 'new string'}) | |
df['column'].str.replace({'old string', 'new string'}, case=False) # case insensitive | |
# Replace string (regex) with new string | |
df['column'].str.replace(r'^www.', 'https://www.', regex=True) | |
# Remove white spaces from string | |
df['column'].str.strip() | |
# === Pandas / Merging Data =================================================================================================== | |
# Merge DataFrames on column | |
pd.concat([df, df], axis=1) | |
pd.concat([df, df], axis=1, join='inner') # Inner join | |
# Merge DataFrames only on selected columns | |
pd.merge(df, df, on='column', how='inner') | |
pd.merge(df, df[['column', 'column1']], on='column', how='inner') # Merge on selected columns with selected columns | |
# === Pandas / Convert Data =================================================================================================== | |
# Convert DataFrame to list | |
df.to_list() | |
# Make DataFrame from list | |
pd.DataFrame([{'column': 'value'}]) | |
# Convert DataFrame to JSON | |
df.to_json() | |
# Convert JSON to DataFrame | |
pd.read_json('path/to/json') | |
# Round values in DataFrame (.00) | |
df.round(2) | |
df.round({'column': 2, 'column1': 3}) # Round values in selected columns | |
# === Pandas / String Manipulation ============================================================================================ | |
# Convert string to ... | |
df['column'].str.lower() # All characters to lower case | |
df['column'].str.upper() # All characters to upper case | |
df['column'].str.title() # Title case (first letter of each word capitalized) | |
df['column'].str.capitalize() # Capitalize first letter of each word | |
df['column'].str.swapcase() # Swap case of each letter (upper to lower, lower to upper) | |
df['column'].str.casefold() # Case folding (lower case) | |
# === Pandas / Examples ======================================================================================================= | |
# Groups GSC daily data into weekly intervals from Monday to Sunday (right closed / included) | |
df.resample('W-SUN', on='date').agg({'clicks':'sum','impressions':'sum','ctr':'mean','position':'mean'}) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment