Skip to content

Instantly share code, notes, and snippets.

@Gabrock94
Created November 22, 2020 13:04
Show Gist options
  • Save Gabrock94/0681a24a840fca983ae8d34f997665a6 to your computer and use it in GitHub Desktop.
Save Gabrock94/0681a24a840fca983ae8d34f997665a6 to your computer and use it in GitHub Desktop.
import os
import pandas as pd
import sklearn as skl
import scipy as sp
import matplotlib.pyplot as plt
# Define the paths
BASEPATH = '/home/giulio/Repositories/covres/'  # This is the folder where db is

# Import the database
df = pd.read_csv(os.path.join(BASEPATH, 'Raw', 'db.csv'))

# --- Correlation heatmap -------------------------------------------------
# Correlations are only defined for numeric data. Selecting the numeric/bool
# columns explicitly keeps the script working on pandas versions where
# DataFrame.corr() no longer drops non-numeric columns silently.
corr = df.select_dtypes(include=['number', 'bool']).corr()

plt.figure(figsize=(10, 10))  # change figsize to get a smaller or bigger figure
# Tick labels must come from the correlation matrix itself (not from df):
# any column excluded from `corr` would otherwise shift every label.
tick_positions = range(len(corr.columns))
plt.xticks(tick_positions, corr.columns, rotation=90)
plt.yticks(tick_positions, corr.columns)
plt.imshow(corr, cmap='hot', interpolation="nearest")
plt.colorbar()

# Save the correlation matrix to file
corr.to_csv(os.path.join(BASEPATH, 'Processed', 'corr.csv'))

# --- Missing values per column -------------------------------------------
# The mean of the boolean null-mask is the fraction of missing entries in
# each column; unlike an explicit division by len(df) it does not raise on
# an empty table (it yields NaN instead).
print("Percentage of Missing values")
missing_pct = df.isnull().mean() * 100
X = [[col, pct] for col, pct in missing_pct.items()]
# NOTE(review): only the header above is printed; `missings` is built but
# never displayed here — presumably inspected interactively. Confirm.
missings = pd.DataFrame(X, columns=['Variable', '% Missing'])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment