### Starter script for analyzing the Eurobarometer Trend data (1970-2002).
### Outline:
### 1. Import the dataset straight from the web
### 2. Isolate variables of interest and clean them up
### 3. Estimate a regression model of what makes people turn out for EP elections
### Install (only when missing) and load the packages we're going to use.
# Calling install.packages() unconditionally re-downloads every package each
# time the script runs, so first check which ones are actually absent.
required_packages <- c("ggplot2", "foreign", "Zelig")
missing_packages <- required_packages[
  !vapply(required_packages, requireNamespace, logical(1), quietly = TRUE)
]
if (length(missing_packages) > 0) {
  # NOTE(review): Zelig may no longer be available from CRAN; if this step
  # fails for Zelig, it may need to be installed from an archive -- confirm.
  install.packages(missing_packages)
}
library(ggplot2)
library(foreign)
library(Zelig)
### Web address of the Eurobarometer trend file (a Stata .dta file)
url <- "https://dl.dropboxusercontent.com/u/20498362/eurobarometer_trends/eurobarometer_trends.dta?raw=1"
### Pull the file directly from that address into a data frame called "data"
data <- foreign::read.dta(file = url)
### Variable "mediause" asks how much the respondent relies on the media.
summary(data$mediause) # Get a simple summary
# Treat the non-substantive answers ("dk", "na", "inap") as missing (NA).
# %in% tests against the whole set in one pass and never produces NA in the mask.
data$mediause[data$mediause %in% c("dk", "na", "inap")] <- NA
data$mediause <- factor(data$mediause) # Re-factoring removes the now-unused levels
# Clean up the messy names of the levels. The new names are assigned by
# position, so exactly four levels must remain at this point.
# NOTE(review): assumes the remaining levels run from highest to lowest media
# use -- confirm against the summary() output above.
levels(data$mediause) <- c("Very high", "High", "Low", "Very low")
# Convert the categories to a numerical scale (cheating a bit: this treats the
# ordered categories as if they were equally spaced).
data$medianum <- as.numeric(data$mediause)
# Subtract from 4 to make the scale more intuitive: higher number = more media
# use (3 = "Very high", 2 = "High", 1 = "Low", 0 = "Very low").
data$medianum <- 4 - data$medianum
### Variable "particip": how likely the respondent says they are to vote in
### the EP elections.
summary(data$particip)
# Blank out each non-substantive answer in turn (set it to NA)
data$particip <- replace(data$particip, which(data$particip == "DK,NA"), NA)
data$particip <- replace(data$particip, which(data$particip == "depends"), NA)
data$particip <- replace(data$particip, which(data$particip == "inap"), NA)
# Rebuild the factor so the emptied levels disappear, then relabel by position
data$particip <- factor(data$particip)
levels(data$particip) <- c(
  "Certainly yes", "Probably yes", "Probably not", "Certainly not"
)
# Numeric version of the categories (cheating a bit!), reversed in the same
# step so that a higher number = more likely to vote
# (3 = "Certainly yes", ..., 0 = "Certainly not")
data$participnum <- 4 - as.numeric(data$particip)
# Collapse into a binary 0/1 version: 1 for the two "yes" answers, 0 otherwise.
# This overwrites the factor version of particip.
data$particip <- as.numeric(data$participnum >= 2)
| ### Variable "income" asks the respondent's income level. | |
| summary(data$income) | |
### Variable "polint": the respondent's general interest in politics.
summary(data$polint)
# Mark the non-substantive answers as missing
is.na(data$polint) <- which(data$polint == "DK,NA")
is.na(data$polint) <- which(data$polint == "inap")
# Drop the emptied levels and give the remaining four readable names
data$polint <- factor(data$polint)
levels(data$polint) <- c(
  "A great deal", "To some extent", "Not much", "Not at all"
)
# Numeric version of the categories (cheating a bit!), reversed in the same
# step so that a higher number = more political interest
# (3 = "A great deal", ..., 0 = "Not at all")
data$polintnum <- 4 - as.numeric(data$polint)
summary(data$polintnum)
### Variable "ecint3" asks the respondent's interest in EU politics
### (three answer categories).
summary(data$ecint3)
# Treat the non-substantive answers as missing (NA). %in% tests against the
# whole set in one pass and never produces NA in the mask.
data$ecint3[data$ecint3 %in% c("DK, NA", "inap")] <- NA
data$ecint3 <- factor(data$ecint3) # Re-factoring removes the now-unused levels
# Relabel by position; exactly three levels must remain at this point.
levels(data$ecint3) <- c("Very interested", "A little", "Not at all")
# Convert the categories to a numerical scale (cheating a bit!)
data$ecint3num <- as.numeric(data$ecint3)
# Reverse the scale so that a higher number = more interest. This variable has
# three categories, so the codes are subtracted from 3 (not 4 as for the
# four-category variables): 2 = "Very interested", 1 = "A little",
# 0 = "Not at all".
data$ecint3num <- nlevels(data$ecint3) - data$ecint3num
summary(data$ecint3num)
### Variable "ecint4": the respondent's interest in EU politics, recorded in
### four answer categories.
summary(data$ecint4)
# Set the non-substantive answers to missing, one label at a time
data$ecint4[which(data$ecint4 == "DK, NA")] <- NA
data$ecint4[which(data$ecint4 == "inap")] <- NA
# Rebuild the factor to shed the emptied levels, then relabel by position
data$ecint4 <- factor(data$ecint4)
levels(data$ecint4) <- c(
  "A great deal", "To some extent", "Not much", "Not at all"
)
# Numeric version of the categories (cheating a bit!), reversed in the same
# step so that a higher number = more interest
# (3 = "A great deal", ..., 0 = "Not at all")
data$ecint4num <- 4 - as.numeric(data$ecint4)
summary(data$ecint4num)
### Variable "nation1": the nation the respondent belongs to
summary(data$nation1)
# Indicator column: 1 when the respondent's nation is Great Britain, else 0
data$gb <- as.numeric(data$nation1 == "GREAT BRITAIN")
### Variable "year": the year in which the survey was fielded
summary(data$year)
##########################################################################
### Zero in on the subset of the sample relevant to your research question
##########################################################################
# Keep a single country in a single year, because pooling several
# countries/years gets complicated statistically.
# which() discards rows where the condition is NA, as subset() would.
dataUK1994 <- data[
  which(data$nation1 == "GREAT BRITAIN" & data$year == 1994),
  c("particip", "medianum", "polintnum", "income", "nation1"),
  drop = FALSE
]
#########################################################################
### Always do some basic visual inspection / descriptive analysis of your key variables
# qplot() is deprecated in current ggplot2, so the plots are built with
# ggplot() directly. Both variables are numeric, for which qplot() drew a
# histogram; geom_histogram() reproduces that.
# Distribution of the binary turnout-intention variable (0 = no, 1 = yes)
ggplot(dataUK1994, aes(x = particip)) +
  geom_histogram() +
  labs(x = "Likely to vote in the EU Parliament election?")
# Distribution of the media-reliance scale (0 = very low, 3 = very high)
ggplot(dataUK1994, aes(x = medianum)) +
  geom_histogram() +
  labs(x = "Degree of relying on the media")
#######################################################
####### Regression analysis using the Zelig package
#######################################################
# The outcome (particip) is binary: likely to vote vs. not likely to vote,
# which calls for a "logit" model.
# NOTE(review): income enters the model as it was read from the file; unlike
# the other variables, its non-answers were not recoded above -- confirm that
# this is intended.
model <- zelig(
  formula = particip ~ income + medianum,
  model = "logit",
  data = dataUK1994
)
summary(model)
# Let media reliance sweep its whole range, from its minimum (0) to its maximum (3)
x.out <- setx(model, medianum = seq(from = 0, to = 3, by = 1))
# Simulate from the fitted model at each of those media values
s.out <- sim(model, x = x.out)
# Plot the simulated results with their confidence intervals
ci.plot(
  s.out,
  main = "Effect of Media on Probability of Voting in the European Election, UK in 1994"
)
### End of script.