Skip to content

Instantly share code, notes, and snippets.

View emredjan's full-sized avatar

Emre Can emredjan

View GitHub Profile
@emredjan
emredjan / cactivate.ps1
Last active November 22, 2018 13:51
Activate conda environment in PowerShell
Param
(
[parameter(Position=0, Mandatory=$true)]
[String]
$env_name
)
# simulate activation of Conda environment
$cenv = "C:\Path\to\anaconda\installation\envs\$env_name"
$Env:CONDA_PREFIX = $cenv
@emredjan
emredjan / json_to_csv.py
Last active October 21, 2022 01:46
Yelp Dataset Challenge JSON to CSV conversion
'''
Load Yelp JSON files and spit out CSV files
Does not try to reinvent the wheel and uses pandas json_normalize
Kinda hacky and requires a bit of RAM. But works, albeit naively.
Tested with Yelp JSON files in dataset challenge round 12:
https://www.yelp.com/dataset/challenge
'''
import json
plot_lm_4 = plt.figure(4)
plot_lm_4.set_figheight(8)
plot_lm_4.set_figwidth(12)
plt.scatter(model_leverage, model_norm_residuals, alpha=0.5)
sns.regplot(model_leverage, model_norm_residuals,
scatter=False,
ci=False,
lowess=True,
line_kws={'color': 'red', 'lw': 1, 'alpha': 0.8})
plot_lm_3 = plt.figure(3)
plot_lm_3.set_figheight(8)
plot_lm_3.set_figwidth(12)
plt.scatter(model_fitted_y, model_norm_residuals_abs_sqrt, alpha=0.5)
sns.regplot(model_fitted_y, model_norm_residuals_abs_sqrt,
scatter=False,
ci=False,
lowess=True,
line_kws={'color': 'red', 'lw': 1, 'alpha': 0.8})
QQ = ProbPlot(model_norm_residuals)
plot_lm_2 = QQ.qqplot(line='45', alpha=0.5, color='#4C72B0', lw=1)
plot_lm_2.set_figheight(8)
plot_lm_2.set_figwidth(12)
plot_lm_2.axes[0].set_title('Normal Q-Q')
plot_lm_2.axes[0].set_xlabel('Theoretical Quantiles')
plot_lm_2.axes[0].set_ylabel('Standardized Residuals');
plot_lm_1 = plt.figure(1)
plot_lm_1.set_figheight(8)
plot_lm_1.set_figwidth(12)
plot_lm_1.axes[0] = sns.residplot(model_fitted_y, 'mpg', data=auto,
lowess=True,
scatter_kws={'alpha': 0.5},
line_kws={'color': 'red', 'lw': 1, 'alpha': 0.8})
plot_lm_1.axes[0].set_title('Residuals vs Fitted')
# fitted values (need a constant term for intercept)
model_fitted_y = model_fit.fittedvalues
# model residuals
model_residuals = model_fit.resid
# normalized residuals
model_norm_residuals = model_fit.get_influence().resid_studentized_internal
# absolute squared normalized residuals
model_f = 'mpg ~ cylinders + \
displacement + \
horsepower + \
weight + \
acceleration + \
year + \
origin'
model = smf.ols(formula=model_f, data=auto)
model_fit = model.fit()
auto = pd.read_csv('Auto.csv', na_values=['?'])
auto.dropna(inplace=True)
auto.reset_index(drop=True, inplace=True)
%matplotlib inline
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.formula.api as smf
from statsmodels.graphics.gofplots import ProbPlot