jmrphy/gss_2010.R

## gss_2010.R
############################################################
### 1. Download and install the packages you want to use ###
############################################################

### Packages are just collections of code written by other R users to simplify common
### tasks. For instance, R is very good at reading all kinds of data files.
### There is a package called "foreign" that lets you read in various data files
### with hardly any effort. We need it here because we have a .dta datafile.
### In the future, when you need to install a certain package, use this code but
### replace "foreign" everywhere it appears below with the name of your package.

# Download and install "foreign" from CRAN (the home of R on the web), but only
# if it is not installed yet. requireNamespace() returns FALSE when the package
# cannot be found, so re-running this script does not download it again.
if (!requireNamespace("foreign", quietly = TRUE)) {
  install.packages("foreign")
}
library(foreign)   # load the package "foreign" in order to use it!

### Note that a package only has to be installed once; in future sessions you
### only have to load it with the library() function.

############################################################
### 2. Load data into R ####################################
############################################################

### Now we're going to use functions in the "foreign" package to load in our "foreign"
### data. Here's the code you need to load in the GSS data file (if you've downloaded it
### and set your working directory to where that data file exists on your hard drive!)
### In the future, when you want to load a different data file, you can use this code
### but adjust it.

# Read the .dta file called "gss2010.dta" (looked up relative to your working
# directory) and store the result in a dataframe called "df". You might get
# some warnings in your console, but that's OK. You're good so long as you now
# see in your Workspace an object called "df" with 4901 observations and
# 1223 variables.
df <- read.dta(file = "gss2010.dta")

############################################################
### 2. Check out and "clean" the data  #####################
############################################################

# Scanning the codebook, one of the 1223 variables you'll find is "pres08" which records,
# for every person surveyed, who that person voted for in the 2008 election. So if we
# are interested in why certain people vote in certain ways--we can use this variable
# to explore what kinds of people are more or less likely to vote for Obama or McCain.
# But first we need to check what the variable actually contains.
# To do this, first find it in the codebook. Then take a look at this variable
# in the data!

# Summarise the variable "pres08" stored in the dataframe "df".
summary(df[["pres08"]])

# The summary shows that this variable contains useless information that we're not
# interested in. For instance, it says 19 people didn't vote and there are a bunch of
# categories that do not describe anyone in the survey! Data "cleaning" refers to
# the common tasks of manipulating these sorts of things to make your variables
# reflect exactly what you're interested in. If you don't clean your data,
# your analyses will be completely wrong because they'll be calculating nonsense.

# So let's just remove those who didn't vote from our consideration. To do this, we can
# just assign missing values to those people. R uses the two capital letters "NA" to
# denote missing values. So...

####################################################################
### 2A. Assign missing values (NA) to values you want to ignore ####
####################################################################

# In the variable "pres08" of the dataframe "df", assign the value NA, but only
# in the rows where that variable is exactly equal to "didn't vote". The inner
# which() turns the comparison into the positions of the matching rows, and
# the square brackets then pick out just those positions.
df[["pres08"]][which(df[["pres08"]] == "didn't vote")] <- NA
summary(df[["pres08"]])   # re-examine it.

# Now notice that the category "didn't vote" has 0 observations. Nice.

# Finally, to get rid of the categories with nobody in them, use the
# factor() function on the variable and assign it to itself.

# Rebuild the factor from its own values and store it back in place.
df[["pres08"]] <- factor(df[["pres08"]])

summary(df[["pres08"]])   # re-examine it.

# Now notice that you only have the levels of interest! Perfect.

####################################################################
### 2B. Recode variables ###########################################
####################################################################

# Sometimes you want to fundamentally restructure your variables
# to better suit a particular question you have. Remember that you
# have full creative control over how you want to shape, group,
# and name different features of empirical data. For instance,
# perhaps a variable currently distinguishes several different categories,
# but you want to simplify them into 2 more general categories.

# Consider the variable "wrkstat", which distinguishes several different
# kinds of work status.

# Summarise the work-status variable "wrkstat" stored in the dataframe "df".
summary(df[["wrkstat"]])

# But maybe we're only interested in trying to figure out the differences
# between the unemployed and everyone else. In which case, we want
# a variable like this one but with only one distinction: between
# the unemployed and anyone in any other work status. To do this,
# we can use the "memisc" package to make a new variable that recodes
# every category except "unemployed" as one category called "not unemployed".

# Install the package "memisc" (really good for recoding), but only if it is
# not installed yet. requireNamespace() returns FALSE when the package cannot
# be found, so re-running this script does not download it again.
if (!requireNamespace("memisc", quietly = TRUE)) {
  install.packages("memisc")
}
library(memisc)   # load the package so you can use its functions

# Build a two-category work-status variable from "wrkstat" in two steps:
# first copy the column, then recode the copy with recode().
df$unemployed <- df$wrkstat  # make a replica of "wrkstat" called "unemployed",
                             # both in the dataframe called "df"

# Each recoding rule below is written as  "new name" <- "old name".
df$unemployed <- recode(df$unemployed, "unemployed" <- "unempl, laid off",
                        "NA"<-NA,
                       otherwise = "not unemployed")

# The call above recodes the variable "unemployed" so that the category
# "unempl, laid off" is now called "unemployed", and any other category is put
# in one category called "not unemployed".
# NOTE(review): the "NA"<-NA rule is presumably meant to keep missing values
# out of "not unemployed" -- confirm against the memisc recode() documentation
# how NA is handled here (it may produce a literal "NA" category instead).

# Summarise the recoded variable "unemployed" stored in the dataframe "df".
summary(df[["unemployed"]])
	############################################################
	### 1. Download and install the packages you want to use ###
	############################################################

	### Packages are just collections of code written by other R users to simplify common
	### tasks. For instance, R is very good at reading all kinds of data files.
	### There is a package called "foreign" that lets you read in various data files
	### with hardly any effort. We need it here because we have a .dta datafile.
	### In the future, when you need to install a certain package, use this code but
	### replace "foreign" everywhere it appears below with the name of your package.

	# Download and install "foreign" from CRAN (the home of R on the web), but only
	# if it is not installed yet. requireNamespace() returns FALSE when the package
	# cannot be found, so re-running this script does not download it again.
	if (!requireNamespace("foreign", quietly = TRUE)) {
	  install.packages("foreign")
	}
	library(foreign) # load the package "foreign" in order to use it!

	### Note that a package only has to be installed once; in future sessions you
	### only have to load it with the library() function.

	############################################################
	### 2. Load data into R ####################################
	############################################################

	### Now we're going to use functions in the "foreign" package to load in our "foreign"
	### data. Here's the code you need to load in the GSS data file (if you've downloaded it
	### and set your working directory to where that data file exists on your hard drive!)
	### In the future, when you want to load a different data file, you can use this code
	### but adjust it.

	# Read the .dta file called "gss2010.dta" (looked up relative to your working
	# directory) and store the result in a dataframe called "df". You might get
	# some warnings in your console, but that's OK. You're good so long as you now
	# see in your Workspace an object called "df" with 4901 observations and
	# 1223 variables.
	df <- read.dta(file = "gss2010.dta")

	############################################################
	### 2. Check out and "clean" the data #####################
	############################################################

	# Scanning the codebook, one of the 1223 variables you'll find is "pres08" which records,
	# for every person surveyed, who that person voted for in the 2008 election. So if we
	# are interested in why certain people vote in certain ways--we can use this variable
	# to explore what kinds of people are more or less likely to vote for Obama or McCain.
	# But first we need to check what the variable actually contains.
	# To do this, first find it in the codebook. Then take a look at this variable
	# in the data!

	# Summarise the variable "pres08" stored in the dataframe "df".
	summary(df[["pres08"]])

	# The summary shows that this variable contains useless information that we're not
	# interested in. For instance, it says 19 people didn't vote and there are a bunch of
	# categories that do not describe anyone in the survey! Data "cleaning" refers to
	# the common tasks of manipulating these sorts of things to make your variables
	# reflect exactly what you're interested in. If you don't clean your data,
	# your analyses will be completely wrong because they'll be calculating nonsense.

	# So let's just remove those who didn't vote from our consideration. To do this, we can
	# just assign missing values to those people. R uses the two capital letters "NA" to
	# denote missing values. So...

	####################################################################
	### 2A. Assign missing values (NA) to values you want to ignore ####
	####################################################################

	# In the variable "pres08" of the dataframe "df", assign the value NA, but only
	# in the rows where that variable is exactly equal to "didn't vote". The inner
	# which() turns the comparison into the positions of the matching rows, and
	# the square brackets then pick out just those positions.
	df[["pres08"]][which(df[["pres08"]] == "didn't vote")] <- NA
	summary(df[["pres08"]]) # re-examine it.

	# Now notice that the category "didn't vote" has 0 observations. Nice.

	# Finally, to get rid of the categories with nobody in them, use the
	# factor() function on the variable and assign it to itself.

	# Rebuild the factor from its own values and store it back in place.
	df[["pres08"]] <- factor(df[["pres08"]])

	summary(df[["pres08"]]) # re-examine it.

	# Now notice that you only have the levels of interest! Perfect.

	####################################################################
	### 2B. Recode variables ###########################################
	####################################################################

	# Sometimes you want to fundamentally restructure your variables
	# to better suit a particular question you have. Remember that you
	# have full creative control over how you want to shape, group,
	# and name different features of empirical data. For instance,
	# perhaps a variable currently distinguishes several different categories,
	# but you want to simplify them into 2 more general categories.

	# Consider the variable "wrkstat", which distinguishes several different
	# kinds of work status.

	# Summarise the work-status variable "wrkstat" stored in the dataframe "df".
	summary(df[["wrkstat"]])

	# But maybe we're only interested in trying to figure out the differences
	# between the unemployed and everyone else. In which case, we want
	# a variable like this one but with only one distinction: between
	# the unemployed and anyone in any other work status. To do this,
	# we can use the "memisc" package to make a new variable that recodes
	# every category except "unemployed" as one category called "not unemployed".

	# Install the package "memisc" (really good for recoding), but only if it is
	# not installed yet. requireNamespace() returns FALSE when the package cannot
	# be found, so re-running this script does not download it again.
	if (!requireNamespace("memisc", quietly = TRUE)) {
	  install.packages("memisc")
	}
	library(memisc) # load the package so you can use its functions

	# Build a two-category work-status variable from "wrkstat" in two steps:
	# first copy the column, then recode the copy with recode().
	df$unemployed <- df$wrkstat # make a replica of "wrkstat" called "unemployed",
	# both in the dataframe called "df"

	# Each recoding rule below is written as  "new name" <- "old name".
	df$unemployed <- recode(df$unemployed, "unemployed" <- "unempl, laid off",
	"NA"<-NA,
	otherwise = "not unemployed")

	# The call above recodes the variable "unemployed" so that the category
	# "unempl, laid off" is now called "unemployed", and any other category is put
	# in one category called "not unemployed".
	# NOTE(review): the "NA"<-NA rule is presumably meant to keep missing values
	# out of "not unemployed" -- confirm against the memisc recode() documentation
	# how NA is handled here (it may produce a literal "NA" category instead).

	# Summarise the recoded variable "unemployed" stored in the dataframe "df".
	summary(df[["unemployed"]])