using CSV, DataFrames, Econometrics, GLM, ShiftedArrays, Statistics, TableView # ShiftedArrays provides lag()
# Read in the data - unbalanced panel
# Data from https://rdrr.io/cran/panelvar/man/abdata.html
df = CSV.read("abdata.csv", copycols = true)
# We start by estimating standard OLS on our data, running the model
#
# yᵢₜ = α₁Lyᵢₜ + α₂L²yᵢₜ + β₁wᵢₜ + β₂Lwᵢₜ + β₃kᵢₜ + β₄Lkᵢₜ + β₅L²kᵢₜ + β₆ysᵢₜ + β₇Lysᵢₜ + β₈L²ysᵢₜ
# + ∑yearₜ
#
# As the model includes two lags of the dependent and independent variables, only observations
# from t=3 onwards can be used, since the second lag is unavailable for 1976 and 1977.
# We therefore drop the 1976 and 1977 dummies, which are all zero once t=1,2 are removed,
# and the 1978 dummy, which would be collinear with the constant and the 1979-1984 dummies
m1 = lm(@formula(n ~ nL1 + nL2 + w + wL1 + k + kL1 + kL2 + ys + ysL1 + ysL2 +
                 yr1979 + yr1980 + yr1981 + yr1982 + yr1983 + yr1984), df)
# Alternatively, we can include year as a categorical variable
df.year_s = string.(df.year)
m2 = lm(@formula(n ~ nL1 + nL2 + w + wL1 + k + kL1 + kL2 + ys + ysL1 + ysL2 + year_s), df)
f = @formula(n ~ nL1 + nL2 + w + wL1 + k + kL1 + kL2 + ys + ysL1 + ysL2 + year_s)
# This turns out to be the same thing:
coef(m1) ≈ coef(m2)
# We therefore find the same results either way: an estimate of 1.043 on lagged employment.
## Purging fixed effects
# One immediate issue with the above is that the error term is likely to contain a fixed
# effect, which is correlated with Lyᵢₜ. To see this, consider a firm that experiences a
# large negative shock in 1980 that isn't modelled and is therefore absorbed in the
# error term. The firm's fixed effect - its average unexplained deviation - over 1976
# to 1984 will then appear lower, but in 1981 lagged employment will also be lower, creating
# a positive correlation between regressor and error and inflating the OLS estimator.
# The first, most obvious way of addressing this is to include firm dummies in the regression,
# or have a categorical firm regressor:
df.firm_cat = string.(df.id)
m3 = lm(@formula(n ~ nL1 + nL2 + w + wL1 + k + kL1 + kL2 + ys + ysL1 + ysL2 + year_s + firm_cat),
    df
)
# Our estimate on lagged employment falls from 1.043 to 0.733. We could have achieved the
# same thing with a within transformation:
fit(EconometricModel,
    @formula(n ~ nL1 + nL2 + w + wL1 + k + kL1 + kL2 + ys + ysL1 + ysL2 +
             yr1979 + yr1980 + yr1981 + yr1982 + yr1983 + yr1984 + absorb(id)),
    df
)
# However, this within transformation does not eliminate the dynamic panel bias, as shown
# by Nickell (1981). Two transformations are commonly used to get around this. The first is
# the first-difference transformation, Δyᵢₜ = αΔLyᵢₜ + Δxᵢₜ'β + Δvᵢₜ. This leaves ΔLyᵢₜ
# correlated with Δvᵢₜ (through Lyᵢₜ in ΔLyᵢₜ and Lvᵢₜ in Δvᵢₜ), but further lags of yᵢₜ are
# available as instruments and will be uncorrelated. It does mean, however, that for each
# missing observation two Δ observations are lost, severely exacerbating problems of missing
# data. To get around this, Arellano and Bover (1995) developed the second, the "forward
# orthogonal deviations" transform, which subtracts from each observation the average of all
# future realisations, and adds a scaling factor to equalise the variances.
# The transform is given by:
#
# uᵢₜ* = cₜ (uᵢₜ - 1/(T-t) ∑_(s>t) uᵢₛ),   where cₜ = √((T-t)/(T-t+1))
#
# and implemented by the following method:
function fod_transform(var, id, df)
    # New column to hold the transformed variable, allowing missings where the
    # transform is undefined
    df[!, Symbol(:fw_, var)] = allowmissing(zeros(nrow(df)))
    for i in unique(df[!, id])
        vv = df[df[!, id] .== i, var]
        tmax = length(vv)
        fod = fill!(allowmissing(zeros(tmax)), missing)
        for t ∈ 1:tmax-1
            cᵢₜ = √((tmax - t)/(tmax - t + 1))           # scaling factor equalising variances
            fod[t+1] = cᵢₜ * (vv[t] - mean(vv[t+1:end])) # store one period late
        end
        df[df[!, id] .== i, Symbol(:fw_, var)] = fod
    end
    return df
end
fod_transform.([:n, :w, :k, :ys], :id, Ref(df))
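# A quick check of the transform logic on toy data (a sketch; `toy` is a hypothetical
# three-period frame, not part of the dataset): the first stored period should be missing,
# and the next two should equal cₜ times the deviation from the mean of future values
toy = DataFrame(id = [1, 1, 1], v = [1.0, 2.0, 4.0])
fod_transform(:v, :id, toy)
ismissing(toy.fw_v[1]) &&
    toy.fw_v[2] ≈ √(2/3) * (1.0 - 3.0) &&
    toy.fw_v[3] ≈ √(1/2) * (2.0 - 4.0)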
# Note that this method currently assumes that data is sorted by year within each group.
# The xtabond2 paper stores the transformed variables one period late and claims this
# is a "software convention" - I'm not sure about this, but have implemented it this way.
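# A quick guard on that assumption (a sketch): check that year is non-decreasing within
# each firm before trusting the transformed columns
all(issorted(df[df.id .== i, :year]) for i in unique(df.id))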
## Establishing a panel structure
# As the following estimators make use of lagged and differenced regressors, it is helpful
# to establish a panel structure. For this, it is generally a good idea to sort the data
# frame by group indicator and time within group:
sort!(df, [:id, :year])
# Next, we want indicators that tell us whether a row is for the same group as the previous
# row and the time increment is 1 - if this is not the case, we shouldn't be constructing
# first differences using that row and the previous one!
df.year_offset_1 = (df.year .- lag(df.year)) .== 1; df.year_offset_1[1] = false
df.same = df.id .== lag(df.id); df.same[1] = false
# We can then construct a set of first-difference variables. We use the group and time checks
# to insert missing values where a row is not for the same group exactly one year on from the
# previous row
for var in [:n, :nL1, :nL2, :w, :wL1, :k, :kL1, :kL2, :ys, :ysL1, :ysL2, :yr1976, :yr1977,
            :yr1978, :yr1979, :yr1980, :yr1981, :yr1982, :yr1983, :yr1984]
    df[!, Symbol(:Δ, var)] = df[!, var] .- lag(df[!, var])
    df[(df.same .== false) .| (df.year_offset_1 .== false), Symbol(:Δ, var)] .= missing
end
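# Sanity check (a sketch): every difference at a panel break should now be missing, and we
# can see how many usable first-differenced observations of n remain
all(ismissing, df.Δn[(df.same .== false) .| (df.year_offset_1 .== false)])
count(!ismissing, df.Δn)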
# With this in hand, we can perform the Anderson-Hsiao (1981) levels estimator, a 2SLS
# estimator that instruments ΔLyᵢₜ with L²yᵢₜ. In Econometrics.jl, this is implemented as
# an EconometricModel whose @formula includes the term (ΔnL1 ~ nL2)
m3 = fit(EconometricModel,
    @formula(Δn ~ (ΔnL1 ~ nL2) + ΔnL2 + Δw + ΔwL1 + Δk + ΔkL1 + ΔkL2 + Δys + ΔysL1 + ΔysL2 +
             Δyr1979 + Δyr1980 + Δyr1981 + Δyr1982 + Δyr1983),
    df,
    panel = :id,
    time = :year
)
# Alternatively, we can build up the formula programmatically
Δfy(x) = Symbol(:Δ, x)
f = Term(:Δn) ~ (Term(:ΔnL1) ~ Term(:nL2)) +
    sum([Term.(Δfy.([:nL2, :w, :wL1, :k, :kL1, :kL2, :ys, :ysL1, :ysL2]));
         [Term(Symbol(:Δyr, string(i))) for i in 1979:1983]])
fit(EconometricModel, f, df, panel = :id, time = :year)
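# Optional check (a sketch): the programmatically built formula should reproduce the
# coefficients of m3 above, assuming the terms end up in the same order
m3b = fit(EconometricModel, f, df, panel = :id, time = :year)
coef(m3b) ≈ coef(m3)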
# The above is equivalent to the Stata command
# ivreg D.n (D.nL1 = nL2) D.(nL2 w wL1 k kL1 kL2 ys ysL1 ysL2 yr1979 ... yr1983)
#
# While this estimate is consistent, it is well outside the 0.7 - 1.05 range established
# earlier. It would be possible to include deeper lags of the dependent variable as
# additional instruments, which improves efficiency if they introduce more information.
# This, however, creates a sample size issue, as further lags mean losing additional years
# of data. To get around this, Holtz-Eakin, Newey, and Rosen (1988) developed a different
# way of creating instruments.
# They use the twice-lagged dependent variable to construct a separate instrument for each
# period, substituting zeros where the lag is unavailable - e.g. for firm 1:
yᵢ = df[df.id .== 1, :n]
Δyᵢ = diff(yᵢ)
Zᵢ = zeros(6, 5)
for i ∈ 1:5
    # the second lag of the level goes on the subdiagonal; all other entries stay zero
    Zᵢ[i+1, i] = lag(yᵢ, 2)[i+2]
end
# We can produce these instruments in our data set as follows:
for yr = minimum(df.year):maximum(df.year)
    for l = 2:yr-minimum(df.year)
        # one instrument column per (year, lag) combination, zero by default
        c = Symbol("z", string(yr), "L", string(l))
        df[!, c] .= 0.0
        for r = 1:size(df, 1)
            if r - l > 0
                # fill in the l-period lag of n where the row l positions above belongs to
                # the same firm and is exactly l years earlier
                if df[r, :id] == df[r-l, :id] && df[r, :year] == df[r-l, :year] + l && df[r, :year] == yr
                    df[r, c] = df[r-l, :n]
                end
            end
        end
    end
end
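# Inspect the generated instrument columns for one firm (a sketch; the r"z" regex selection
# mirrors its use below and assumes no other column names contain a "z")
df[df.id .== 1, r"z"]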
# Then construct an array of formula terms, one per instrument column
z = [Term(i) for i in names(df[:, r"z"])]
# And construct the formula itself - this is a difference model with the lag instrumented
# using the Holtz-Eakin et al. instruments
f = Term(:Δn) ~ (ConstantTerm(0) +
    sum([Term.(Δfy.([:w, :wL1, :k, :kL1, :kL2, :ys, :ysL1, :ysL2]));
         [Term(Symbol(:Δyr, string(i))) for i in 1979:1983]]) +
    (Term(:ΔnL1) ~ sum(z)))
fit(EconometricModel, f, df, panel = :id, time = :year)
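# As a final check (a sketch; m5 is just a name for re-fitting the model above so it can be
# inspected), look at the coefficient table and compare the ΔnL1 estimate against the
# 0.7 - 1.05 range discussed earlier
m5 = fit(EconometricModel, f, df, panel = :id, time = :year)
coeftable(m5)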