Skip to content

Instantly share code, notes, and snippets.

View sachinsdate's full-sized avatar
💭
Up to my ears in regression modeling

sachinsdate

💭
Up to my ears in regression modeling
View GitHub Profile
@sachinsdate
sachinsdate / instrumental_variables_regression.py
Last active September 29, 2025 20:00
A tutorial on instrumental variables regression using the IV2SLS class of statsmodels
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
from statsmodels.api import add_constant
from statsmodels.sandbox.regression.gmm import IV2SLS
#Read the Panel Study of Income Dynamics (PSID) extract into a DataFrame.
#header=0: the first row of the CSV supplies the column names.
df = pd.read_csv(
    filepath_or_buffer='PSID1976.csv',
    header=0,
)
@sachinsdate
sachinsdate / bimodal_residuals.py
Created May 30, 2020 12:10
What to do when residuals have a bimodal distribution
import pandas as pd
from patsy import dmatrices
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
import statsmodels.stats.stattools as st
import matplotlib.pyplot as plt
#Create a pandas DataFrame for the counts data set.
#The 'dteday' column is parsed into datetimes. The infer_datetime_format flag
#is intentionally not passed: it is deprecated since pandas 2.0, where format
#inference is already the default behaviour.
df = pd.read_csv(
    'bike_sharing_dataset_daywise.csv',
    header=0,
    parse_dates=['dteday'],
)
@sachinsdate
sachinsdate / timeseries_decomposition.py
Created June 20, 2020 19:11
A deep dive into the algorithm for time series decomposition
import pandas as pd
import numpy as np
import math
from matplotlib import pyplot as plt
#construct the date parser
#NOTE: the pd.datetime alias was deprecated in pandas 1.0 and removed in 2.0,
#so the standard library's datetime class is used directly.
from datetime import datetime


def mydateparser(x):
    """Parse a 'dd-mm-yy' date string (e.g. '25-12-19') into a datetime.

    Raises ValueError if x does not match the '%d-%m-%y' format.
    """
    return datetime.strptime(x, '%d-%m-%y')
#load the data set into a pandas data frame
@sachinsdate
sachinsdate / holt_winters.py
Created July 25, 2020 18:13
Holt-Winters Exponential Smoothing using Python and statsmodels
import pandas as pd
from matplotlib import pyplot as plt
from statsmodels.tsa.holtwinters import ExponentialSmoothing as HWES
#Read the data file. The date column is expected to be in the mm-dd-yyyy format.
#Column 0 is parsed as dates and used as the index. infer_datetime_format is
#not passed because it is deprecated since pandas 2.0 (inference is the default).
df = pd.read_csv(
    'retail_sales_used_car_dealers_us_1992_2020.csv',
    header=0,
    parse_dates=[0],
    index_col=[0],
)
#Tag the DatetimeIndex with a month-start frequency
df.index.freq = 'MS'
#plot the data
df.plot()
@sachinsdate
sachinsdate / linear_model_plots.py
Created March 5, 2021 00:44
Create and show a plot of the Mean Model and a Linear Regression Model
import pandas as pd
from matplotlib import pyplot as plt
from statsmodels.regression.linear_model import OLS as OLS
import statsmodels.api as sm
#Load the curated Taiwan real estate valuation data; row 0 holds the column names
df = pd.read_csv(
    filepath_or_buffer='taiwan_real_estate_valuation_curated.csv',
    header=0,
)
#Pick out the two columns of interest: y is presumably the response variable
#and X the single regressor -- confirm against the rest of the script
y, X = df['HOUSE_PRICE_PER_UNIT_AREA'], df['HOUSE_AGE_YEARS']
@sachinsdate
sachinsdate / DOHMH_Beach_Water_Quality_Data.csv
Created July 8, 2021 10:37
Water quality samples taken from various NYC beaches. Data source: NYC OpenData under their terms of use: https://www1.nyc.gov/home/terms-of-use.page
We can't make this file beautiful and searchable because it's too large.
Sample_ID,Sample_Date,Beach_Name,Sample_Location,Enterococci_Results,Units_or_Notes
050514CP13,05/05/2014,MIDLAND BEACH,Center,20.0,MPN/100 ml
062011GR04,06/20/2011,MANHATTAN BEACH,Left,,Result below detection limit
072808BH09,07/28/2008,MIDLAND BEACH,Right,28.0,MPN/100 ml
051214CP36,05/12/2014,SOUTH BEACH,Right,4.0,MPN/100 ml
081511KB07,08/15/2011,CEDAR GROVE,Left,360.0,MPN/100 ml
062909KB01,06/29/2009,MANHATTAN BEACH,Left,8.0,MPN/100 ml
082112KB07,08/21/2012,CEDAR GROVE,Left,20.0,MPN/100 ml
072015GR06,07/20/2015,MANHATTAN BEACH,Right,,Result below detection limit
082613CP16,08/26/2013,SOUTH BEACH,Center,12.0,MPN/100 ml
@sachinsdate
sachinsdate / interval_estimate.py
Created July 11, 2021 16:31
Calculation of the interval estimate for the population mean at a specified confidence level
import math
import matplotlib.pyplot as plt
from scipy.stats import invweibull
from scipy.stats import norm
import numpy as np
import pandas as pd
#Load the data file, parsing the 'Sample_Date' column into datetimes.
#infer_datetime_format is not passed: it is deprecated since pandas 2.0,
#where the date format is already inferred by default.
df = pd.read_csv(
    'DOHMH_Beach_Water_Quality_Data.csv',
    header=0,
    parse_dates=['Sample_Date'],
)
import math
import numpy as np
import statsmodels.api as sm
from statsmodels.base.model import GenericLikelihoodModel
from scipy.stats import poisson
from scipy.stats import binom
from patsy import dmatrices
import statsmodels.graphics.tsaplots as tsa
from matplotlib import pyplot as plt
@sachinsdate
sachinsdate / markov_process.py
Created October 26, 2021 17:37
Some important concepts about a Markov process and a simulation of stock price movement using a 2-state Markov process
import numpy as np
from matplotlib import pyplot as plt
import random
import math
#Transition matrix P of the 2-state process (each row sums to 1)
P = np.array([
    [0.6, 0.4],
    [0.75, 0.25],
])
#pi_0: presumably the state probability vector at time 0
pi_0 = np.array([0.5, 0.5])
@sachinsdate
sachinsdate / markov_switching_dynamic_regression.py
Created November 13, 2021 17:35
A tutorial on Markov Switching Dynamic Regression Model using Python and statsmodels
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import statsmodels.api as sm
#Load the PCE and UMCSENT datasets.
#Column 0 becomes the index and 'DATE' is parsed into datetimes.
#infer_datetime_format is not passed: it is deprecated since pandas 2.0,
#where the date format is already inferred by default.
df = pd.read_csv(
    filepath_or_buffer='UMCSENT_PCE.csv',
    header=0,
    index_col=0,
    parse_dates=['DATE'],
)
#Set the index frequency to 'Month-Start'
df = df.asfreq('MS')