Skip to content

Instantly share code, notes, and snippets.

@quocnb
Last active January 30, 2018 06:35
Show Gist options
  • Save quocnb/e04baf9e84c9a01929c1ab45d12082d1 to your computer and use it in GitHub Desktop.
Save quocnb/e04baf9e84c9a01929c1ab45d12082d1 to your computer and use it in GitHub Desktop.

NaN Percent

# Summarise missing values per column of `df`: the absolute number of nulls
# ('Total') and the fraction of rows that are null ('Percent', a 0-1 ratio).
null_mask = df.isnull()
null_counts = null_mask.sum()
total = null_counts.sort_values(ascending=False)
percent = (null_counts / null_mask.count()).sort_values(ascending=False)
missing_data = pd.concat(
    [total, percent],
    axis=1,
    keys=['Total', 'Percent'],
)
# Display the first 20 rows of the summary table.
missing_data.head(20)

Heat map

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Heatmap of the pairwise correlation coefficients between the k columns of
# `df` selected via the 'SalePrice' column of `corrmat`.
# NOTE(review): `corrmat` is not defined in this snippet - presumably the
# correlation matrix of `df` (e.g. df.corr()); confirm against the notebook.
k = 10  # number of variables for heatmap
top_rows = corrmat.nlargest(k, 'SalePrice')  # k rows with the largest 'SalePrice' value
cols = top_rows['SalePrice'].index
# np.corrcoef treats each row as one variable, hence the transpose.
cm = np.corrcoef(df[cols].values.T)
sns.set(font_scale=1.25)
tick_labels = cols.values
hm = sns.heatmap(
    cm,
    cbar=True,
    annot=True,
    square=True,
    fmt='.2f',
    annot_kws={'size': 10},
    yticklabels=tick_labels,
    xticklabels=tick_labels,
)
plt.show()

Show Relationship

def show_relationship(name, ylim=(0, 8*(10**5))):
    """Scatter-plot column `name` of the global `df` against 'SalePrice'.

    Args:
        name: Label of the `df` column to put on the x axis.
        ylim: (lower, upper) limits for the 'SalePrice' y axis.
    """
    price = df['SalePrice']
    feature = df[name]
    pair = pd.concat([price, feature], axis=1)
    pair.plot.scatter(x=name, y='SalePrice', ylim=ylim)

Selecting pandas DataFrame Rows Based On Conditions

Test data

# Import modules
import pandas as pd
import numpy as np

# Build a small example dataframe; the last three first names are missing (NaN)
column_order = ['first_name', 'nationality', 'age']
raw_data = {
    'first_name': ['Jason', 'Molly', np.nan, np.nan, np.nan],
    'nationality': ['USA', 'USA', 'France', 'UK', 'UK'],
    'age': [42, 52, 36, 24, 70],
}
df = pd.DataFrame(raw_data, columns=column_order)
# Bare expression so a notebook cell displays the frame
df

Method 1: Using boolean masks

# Boolean mask: True for rows whose nationality is USA
american = df['nationality'].eq("USA")

# Boolean mask: True for rows whose age is greater than 50
elderly = df['age'].gt(50)

# Keep only the rows where both masks are True
df.loc[american & elderly]

Method 2: Using variable attributes

# Rows whose first name is present (not NaN) and whose nationality is USA
has_first_name = df['first_name'].notna()
is_american = df['nationality'].eq("USA")
df.loc[has_first_name & is_american]

Seaborn Type

import seaborn as sns
import matplotlib.pyplot as plt
# Kernel-density plots of the 'Age' column, overall and faceted.
# NOTE(review): `titanic` is not defined in this snippet; it is assumed to be a
# DataFrame with 'Age', 'Pclass', 'Survived' and 'Sex' columns - confirm.
# Uses the current seaborn keyword names: `fill=` (formerly `shade=`) for
# kdeplot and `height=` (formerly `size=`) for FacetGrid.
sns.set_style('white')

# 1) Overall age density.
sns.kdeplot(titanic['Age'], fill=True)
#sns.distplot(titanic['Age'])
sns.despine(left=True, bottom=True)
plt.xlabel('Age')
plt.show()

# 2) One panel per passenger class (columns).
g = sns.FacetGrid(titanic, col='Pclass', height=6)
g.map(sns.kdeplot, 'Age', fill=True)
sns.despine(left=True, bottom=True)
plt.show()

# 3) Grid of panels: survival outcome in columns, passenger class in rows.
g = sns.FacetGrid(titanic, col="Survived", row="Pclass")
g.map(sns.kdeplot, "Age", fill=True)
sns.despine(left=True, bottom=True)
plt.show()

# 4) Same grid, with one coloured density per sex and a legend.
g = sns.FacetGrid(titanic, col='Survived', row='Pclass', hue='Sex', height=3, legend_out=True)
g = (g.map(sns.kdeplot, 'Age', fill=True).add_legend())
sns.despine(left=True, bottom=True)
plt.show()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment