Last active
October 25, 2019 11:11
-
-
Save nilshg/0c609b9b76d970825e2a3afcc602e211 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using CSV, DataFrames, Econometrics, GLM, Statistics, TableView | |
# Read in the data - unbalanced panel | |
# Data from https://rdrr.io/cran/panelvar/man/abdata.html | |
df = CSV.read("abdata.csv", copycols = true) | |
# We start by estimating standard OLS on our data, running the model | |
# | |
# yᵢₜ = α₁Lyᵢₜ + α₂L²yᵢₜ + β₁wᵢₜ + β₂Lwᵢₜ + β₃kᵢₜ + β₄Lkᵢₜ + β₅L²kᵢₜ + β₆ysᵢₜ + β₇Lysᵢₜ + β₈L²ysᵢₜ | |
# + ∑yearₜ | |
# | |
# As the model includes two lags of dependent and independent variables, only observations | |
# from t=3 are included in the model, as the second lag is unavailable for 1976 and 1977. | |
# We therefore need to drop 1976 and 1977, as they only include zeros once t=1,2 are removed, | |
# and 1978 given that it would be collinear with 1979-1984 dummies | |
m1 = lm(@formula(n ~ nL1 + nL2 + w + wL1 + k + kL1 + kL2 + ys + ysL1 + ysL2 + | |
yr1979 + yr1980 + yr1981 + yr1982 + yr1983 + yr1984), df) | |
# Alternatively, we can include year as a categorical variable | |
df.year_s = string.(df.year) | |
m2 = lm(@formula(n ~ nL1 + nL2 + w + wL1 + k + kL1 + kL2 + ys + ysL1 + ysL2 + year_s), df) | |
f = @formula(n ~ nL1 + nL2 + w + wL1 + k + kL1 + kL2 + ys + ysL1 + ysL2 + year_s) | |
# This turns out to be the same thing: | |
coef(m1) ≈ coef(m2) | |
# We therefore find | |
## Purging fixed effects | |
# One immediate issue with the above is that the error term is likely to contain a fixed | |
# effect, which is correlated with Lyᵢₜ - to see this consider a firm that experiences a | |
# large large negative shock that isn't modelled in 1980, which will be absorbed in the | |
# error term. The fixed effect - the average unexplained deviation from average - for 1976 | |
# to 86 will then appear lower, but in 1981 lagged employment will also be lower, creating | |
# a positive correlation and inflating the OLS estimator. | |
# The first, most obvious way of addressing this is to include firm dummies in the regression, | |
# or have a categorical firm regressor: | |
df.firm_cat = string.(df.id) | |
m3 = lm(@formula(n ~ nL1 + nL2 + w + wL1 + k + kL1 + kL2 + ys + ysL1 + ysL2 + year + firm_cat), | |
df | |
) | |
# Our estimate has reduced from 1.043 to 0.733. We could have achieved the same thing with | |
# a within tranformation: | |
fit(EconometricModel, | |
@formula(n ~ nL1 + nL2 + w + wL1 + k + kL1 + kL2 + ys + ysL1 + ysL2 + | |
yr1979 + yr1980 + yr1981 + yr1982 + yr1983 + yr1984 + absorb(id) | |
), | |
df | |
) | |
# However, this within transformation does not eliminate the dynamic panel bias, as shown | |
# by Nickell (1981). Two tansformations are commonly used to get around this: the first- | |
# difference transformation, Δyᵢₜ = αΔLyᵢₜ + Δxᵢₜ'β + Δvᵢₜ. This will leave ΔLyᵢₜ correlated | |
# with Δvᵢₜ (through Lyᵢₜ in ΔLyᵢₜ and Lvᵢₜ in Δvᵢₜ), but further lags of yᵢₜ are available | |
# as instruments and will be uncorrelated. This transform however means that for each missing | |
# observation, two Δ observations are lost, severly exacerbating problems of missing data. | |
# To get around this, Arellano and Bover (1995) developed the "forward orthogonal deviations" | |
# transform, which subtracts from each observation the average of all future realisations, | |
# and adds a scaling factor to equalise the variances. | |
# The transform is given by: | |
# | |
# uᵢₜ* = cₜ (uᵢₜ - 1/(T-t) ∑_(s>t)uᵢₛ) | |
# | |
# and implemented by the following method: | |
function fod_transform(var, id, df) | |
df[!, Symbol(:fw_, var)] = allowmissing(zeros(nrow(df))) | |
for i in unique(df[!, id]) | |
vv = df[df[!, id] .== i, var] | |
tmax = length(vv) | |
fod = fill!(allowmissing(zeros(tmax)), missing) | |
for t ∈ 1:tmax-1 | |
cᵢₜ = √((tmax - t)/(tmax - t + 1)) | |
fod[t+1] = cᵢₜ * (vv[t] - mean(vv[t+1:end])) # store one period late | |
end | |
df[df[!, id] .== i, Symbol(:fw_, var)] = fod | |
end | |
return df | |
end | |
fod_transform.([:n, :w, :k, :ys], :id, Ref(df)) | |
# Note that this method currently assumes that data is sorted by years within group. | |
# The xtabond2 paper stores the transformed variables one period late, and claims this | |
# is a "software convention" - I'm not sure on this but have implemented it this way. | |
## Establishing a panel structure | |
# As the following estimators make use of lagged and differenced regressors, it is helpful | |
# to establish a panel structure. For this, it is generally a good idea to sort the data | |
# frame by group indicator and time within group: | |
sort!(df, [:id, :year]) | |
# Next, we want indicators that tell us whether a row is for the same group as the previous | |
# row and the time increment is 1 - if this is not the case, we shouldn't be constructing | |
# first differences using that row and the previous one! | |
df.year_offset_1 = (df.year .- lag(df.year)) .== 1; df.year_offset_1[1] = false | |
df.same = df.id .== lag(df.id); df.same[1] = false | |
# We can then construct a bunch of differences variables. We use the group and time check | |
# to insert missing values where a row is not for the same group one year on from the previous | |
# row | |
for var in [:n, :nL1, :nL2, :w, :wL1, :k, :kL1, :kL2, :ys, :ysL1, :ysL2, :yr1976, :yr1977, | |
:yr1978, :yr1979, :yr1980, :yr1981, :yr1982, :yr1983, :yr1984] | |
df[!, Symbol(:Δ, var)] = df[!, var] .- lag(df[!, var]) | |
df[(df.same .== false) .| (df.year_offset_1 .== false), Symbol(:Δ, var)] .= missing | |
end | |
# With this in hand, we can performa the Anderson-Hsiao (1981) levels estimator, a 2SLS | |
# estimator that instruments ΔLyᵢₜ with L²yᵢₜ. In Econometrics.jl, this is implemented as | |
# an EconometricModel which includes (ΔnL1 ~ nL2) in the @formula term | |
m3 = fit(EconometricModel, | |
@formula(Δn ~ (ΔnL1 ~ nL2) + ΔnL2 + Δw + ΔwL1 + Δk + ΔkL1 + ΔkL2 + Δys + ΔysL1 + ΔysL2 + | |
Δyr1979 + Δyr1980 + Δyr1981 + Δyr1982 + Δyr1983), | |
df, | |
panel = :id, | |
time = :year | |
) | |
# Alternatively, we can build up the formula programatically | |
Δfy(x) = Symbol(:Δ, x) | |
f = Term(:Δn) ~ (Term(:ΔnL1) ~ Term(:nL2)) + | |
sum( [Term.(Δfy.([:nL2, :w, :wL1, :k, :kL1, :kL2, :ys, :ysL1, :ysL2])); | |
[Term(Symbol(:Δyr, string(i))) for i in 1979:1983]] | |
) | |
fit(EconometricModel, f, df, panel = :id, time = :year) | |
# The above is equivalent to the Stata command | |
# ivreg D.n (D.nL1 = nL2) D.(nL2 w wL1 k kL1 kL2 ys ysL1 ysL2 yr1979 ... yr1983) | |
# | |
# While this estimate is consistent, it is well outside the 0.7 - 1.05 range established | |
# earlier. To improve efficiency, it would be possible to include deeper lags of the | |
# dependent variable as additional instruments, which improves efficiency if it introduces | |
# more information. This however creates a sample size issue, as further lags mean losing | |
# additional years of data. To get around this, Holtz-Eakin, Newey, and Rosen (1988) develop | |
# different way to create instruments. | |
# They use the twice lag instrument of the dependent variable to construct instruments, | |
# substituting zeros for missing values: | |
yᵢ = df[df.id .== 1, :n] | |
Δyᵢ = diff(yᵢ) | |
Zᵢ = zeros(6, 5) | |
for i ∈ 1:5 | |
Zᵢ[i+1, i] = lag(yᵢ, 2)[i+2] | |
end | |
# We can procude these instruments in our data set as follows: | |
for yr = minimum(df.year):maximum(df.year) | |
for l = 2:yr-minimum(df.year) | |
c = Symbol("z", string(yr), "L", string(l)) | |
df[!, c] .= 0.0 | |
for r = 1:size(df, 2) | |
if r-l > 0 | |
if df[r, :id] == df[r-l, :id] && df[r, :year] == df[r+l, :year] - l && df[r, :year] == yr | |
df[r, c] = df[r-l, :n] | |
end | |
end | |
end | |
end | |
end | |
# Then construct an array of terms of the formula | |
z = [Term(i) for i in names(df[:, r"z"])] | |
# And construct the formula itself - this is a difference model with the lag instrumented | |
# using the Holtz-Eakin et al. instruments | |
f = Term(:Δn) ~ (ConstantTerm(0) | |
+ sum( [Term.(Δfy.([:w, :wL1, :k, :kL1, :kL2, :ys, :ysL1, :ysL2])); | |
[Term(Symbol(:Δyr, string(i))) for i in 1979:1983]] ) | |
+ (Term(:ΔnL1) ~ sum(z) ) ) | |
fit(EconometricModel, f, df, panel = :id, time = :year) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment