## franvillamil/data.R

## data.R
setwd("...")
library(ggplot2)

## DATA: Ron Francisco's European Protest and Coercion Data.
## Downloaded (.xls files) from http://web.ku.edu/~ronfrand/data/ and converted into .csv with MS Excel.

# Main data frame and list of input files.
# `data` starts out with every column set to NULL, i.e. as a data frame with no
# rows and no columns; it only has to exist so that the reading loop can
# rbind() each country's counts onto it.
data <- data.frame(date = NULL, action = NULL, freq = NULL, country = NULL)

# Base names of the input CSVs (country name followed by a two-digit year
# range), one entry per file, listed in the order in which they are read.
countrycodes <- c(
	"Albania80-88", "Albania89-95",
	"Austria80-95",
	"Belgium80-87", "Belgium88-95",
	"Bulgaria80-89", "Bulgaria90-95",
	"Cyprus80-95",
	"Denmark80-86",
	"Finland80-95",
	"France80-83", "France84-86", "France87-89", "France90-92", "France93-95",
	"FRG80-83", "FRG84-86", "FRG87-89", "FRG90-92", "FRG93-95",
	"Greece80-95",
	"Iceland80-95",
	"Ireland80-83", "Ireland84-86", "Ireland87-89", "Ireland90-92", "Ireland93-95",
	"Italy80-83", "Italy84-87", "Italy88-91", "Italy92-95",
	"Luxembourg80-95",
	"Netherlands80-95",
	"NorthernIreland80-83", "NorthernIreland84-86", "NorthernIreland87-89",
	"NorthernIreland90-92", "NorthernIreland93-95",
	"Norway80-95",
	"Poland80-81", "Poland82-83", "Poland84-85", "Poland86-87", "Poland88-89",
	"Poland90-92", "Poland93-95",
	"Portugal80-87", "Portugal88-95",
	"Romania80-95",
	"Spain80-83", "Spain84-86", "Spain87-89", "Spain90-92", "Spain93-95",
	"Sweden80-95",
	"Switzerland80-95",
	"UK80-83", "UK84-86", "UK87-89", "UK90-92", "UK93-95"
)

### MISSING: 	Czechoslovakia & GDR & Hungary (n=24!) (remove all communists?)

# Reading data from country CSVs
# (NOTE: data files must be in "data" folder inside working directory)
# For every entry of `countrycodes` this reads data/<code>.csv, keeps columns 1
# (date) and 3 (action type), recodes the action to demonstration / strike /
# other, counts events per month and action, and appends the demonstration and
# strike counts to `data` together with the country name.
for (j in seq_along(countrycodes)) {
	# Getting the file name and reading the CSV
	filename <- paste0("data/", countrycodes[j], ".csv")
	i <- read.csv(file = filename, header = TRUE)
	# Removing all columns but date and action type
	i <- i[, c(1, 3)]
	names(i) <- c("date", "action")
	# The recoding below works on factor levels. Since R 4.0.0 read.csv() returns
	# character columns by default; for those levels() is NULL and the recoding
	# would silently do nothing, so the conversion is made explicit here.
	i$action <- factor(i$action)
	# Changing action names: demonstrations, strikes, and others
	levels(i$action)[levels(i$action) == "demonstrations"] <- "demonstration"
	levels(i$action)[levels(i$action) %in% c("general strike", "strike ")] <- "strike"
	levels(i$action)[!(levels(i$action) %in% c("demonstration", "strike"))] <- "other"
	# Getting the month from date column (output: chr variable)
	# NOTE(review): %b parses month abbreviations according to the current
	# locale - confirm the files use names matching the locale this runs under.
	i$date <- as.Date(as.character(i$date), "%d-%b-%y")
	i$date <- format(i$date, "%Y-%m")
	# Getting monthly frequencies (one row per month x action combination)
	i <- as.data.frame(table(i$date, i$action))
	# Removing "other" events
	i <- i[i$Var2 %in% c("demonstration", "strike"), ]
	# Changing variables names
	names(i)[1:3] <- c("date", "action", "freq")
	# Getting country name by stripping the trailing "NN-NN" year range from the
	# file code, and adding it as a fourth column
	countryname <- sub("[0-9]{2}-[0-9]{2}$", "", countrycodes[j])
	i <- cbind(i, country = rep(countryname, nrow(i)))
	# Add to main dataframe
	data <- rbind(data, i)
}

	# Removing 2 cases wrongly coded in 2066.
	# A logical mask is used instead of -which(...): if no row matched, which()
	# would return integer(0) and data[-integer(0), ] would drop every row.
	data <- data[!(data$date %in% "2066-02"), ]
	# Cleaning workspace and removing "other" level from action variable
	rm(countrycodes, j, i, filename, countryname)
	data$action <- factor(data$action)


# Turning date variable into date class for plotting: each "YYYY-MM" label is
# prefixed with day "01-" and parsed, giving the first day of that month.
data$date <- as.Date(paste0("01-", as.character(data$date)), format = "%d-%Y-%m")

# Plot protest data through time, separating strikes from demonstrations.
# Bar heights are the precomputed `freq` values; each action type gets its own
# facet (stacked in one column) with an independent y axis.
protest.plot <- ggplot(data, aes(x = date, y = freq)) +
	# freq is already a count, so the identity stat is used; a bin width has no
	# meaning here, hence geom_bar() rather than geom_histogram(binwidth = 1).
	geom_bar(stat = "identity") +
	scale_x_date(breaks = "3 months",
		minor_breaks = "1 month",
		# date_format() is exported by the scales package, which this script never
		# attaches; qualifying the call lets it resolve with only ggplot2 loaded.
		labels = scales::date_format("%b %Y"),
		# The x axis is restricted to 1980-07-01 .. 1995-04-01.
		limits = c(as.Date("1980-07-01"), as.Date("1995-04-01"))) +
	theme(axis.text.x = element_text(angle = 90),
		plot.title = element_text(face="bold")) +
	ylab("") + xlab("") +
	ggtitle("MONTHLY NUMBER OF DEMONSTRATIONS AND STRIKES IN EUROPE, 1980-1995\n
		(Albania, Austria, Belgium, Bulgaria, Cyprus, Denmark, Finland, France, FR Germany, Greece, Iceland, Ireland,
			Italy, Luxembourg, Netherlands, N. Ireland, Norway, Poland, Portugal, Romania, Spain, Sweden, Switzerland, UK)\n") +
	facet_wrap( ~ action, ncol = 1, scales = "free_y")

# Plot dependent variable (monthly freq of demonstrations/strikes):
# histogram, with unit-wide bins, of the `freq` column over the rows of `data`
# whose freq is below 101.
DV.plot <- ggplot(data = data[data$freq < 101, ], mapping = aes(x = freq)) +
	geom_histogram(binwidth = 1) +
	labs(x = "Monthly frequency of demonstrations and strikes in each country", y = "") +
	ggtitle("Dependent variable - freq distribution\n(Removed 21 cases above 100 from plot)")
	setwd("...")
	library(ggplot2)

	## DATA: Ron Francisco's European Protest and Coercion Data.
	## Downloaded (.xls files) from http://web.ku.edu/~ronfrand/data/ and converted into .csv with MS Excel.

	# Main data frame and list of input files.
	# `data` starts out with every column set to NULL, i.e. as a data frame with
	# no rows and no columns; it only has to exist so that the reading loop can
	# rbind() each country's counts onto it.
	data <- data.frame(date = NULL, action = NULL, freq = NULL, country = NULL)

	# Base names of the input CSVs (country name followed by a two-digit year
	# range), one entry per file, listed in the order in which they are read.
	countrycodes <- c(
		"Albania80-88", "Albania89-95",
		"Austria80-95",
		"Belgium80-87", "Belgium88-95",
		"Bulgaria80-89", "Bulgaria90-95",
		"Cyprus80-95",
		"Denmark80-86",
		"Finland80-95",
		"France80-83", "France84-86", "France87-89", "France90-92", "France93-95",
		"FRG80-83", "FRG84-86", "FRG87-89", "FRG90-92", "FRG93-95",
		"Greece80-95",
		"Iceland80-95",
		"Ireland80-83", "Ireland84-86", "Ireland87-89", "Ireland90-92", "Ireland93-95",
		"Italy80-83", "Italy84-87", "Italy88-91", "Italy92-95",
		"Luxembourg80-95",
		"Netherlands80-95",
		"NorthernIreland80-83", "NorthernIreland84-86", "NorthernIreland87-89",
		"NorthernIreland90-92", "NorthernIreland93-95",
		"Norway80-95",
		"Poland80-81", "Poland82-83", "Poland84-85", "Poland86-87", "Poland88-89",
		"Poland90-92", "Poland93-95",
		"Portugal80-87", "Portugal88-95",
		"Romania80-95",
		"Spain80-83", "Spain84-86", "Spain87-89", "Spain90-92", "Spain93-95",
		"Sweden80-95",
		"Switzerland80-95",
		"UK80-83", "UK84-86", "UK87-89", "UK90-92", "UK93-95"
	)

	### MISSING: Czechoslovakia & GDR & Hungary (n=24!) (remove all communists?)

	# Reading data from country CSVs
	# (NOTE: data files must be in "data" folder inside working directory)
	# For every entry of `countrycodes` this reads data/<code>.csv, keeps columns
	# 1 (date) and 3 (action type), recodes the action to demonstration / strike /
	# other, counts events per month and action, and appends the demonstration
	# and strike counts to `data` together with the country name.
	for (j in seq_along(countrycodes)) {
		# Getting the file name and reading the CSV
		filename <- paste0("data/", countrycodes[j], ".csv")
		i <- read.csv(file = filename, header = TRUE)
		# Removing all columns but date and action type
		i <- i[, c(1, 3)]
		names(i) <- c("date", "action")
		# The recoding below works on factor levels. Since R 4.0.0 read.csv()
		# returns character columns by default; for those levels() is NULL and the
		# recoding would silently do nothing, so the conversion is made explicit.
		i$action <- factor(i$action)
		# Changing action names: demonstrations, strikes, and others
		levels(i$action)[levels(i$action) == "demonstrations"] <- "demonstration"
		levels(i$action)[levels(i$action) %in% c("general strike", "strike ")] <- "strike"
		levels(i$action)[!(levels(i$action) %in% c("demonstration", "strike"))] <- "other"
		# Getting the month from date column (output: chr variable)
		# NOTE(review): %b parses month abbreviations according to the current
		# locale - confirm the files use names matching the locale this runs under.
		i$date <- as.Date(as.character(i$date), "%d-%b-%y")
		i$date <- format(i$date, "%Y-%m")
		# Getting monthly frequencies (one row per month x action combination)
		i <- as.data.frame(table(i$date, i$action))
		# Removing "other" events (the previous `\|` here was not valid R syntax)
		i <- i[i$Var2 %in% c("demonstration", "strike"), ]
		# Changing variables names
		names(i)[1:3] <- c("date", "action", "freq")
		# Getting country name by stripping the trailing "NN-NN" year range from
		# the file code, and adding it as a fourth column
		countryname <- sub("[0-9]{2}-[0-9]{2}$", "", countrycodes[j])
		i <- cbind(i, country = rep(countryname, nrow(i)))
		# Add to main dataframe
		data <- rbind(data, i)
	}

	# Removing 2 cases wrongly coded in 2066.
	# A logical mask is used instead of -which(...): if no row matched, which()
	# would return integer(0) and data[-integer(0), ] would drop every row.
	data <- data[!(data$date %in% "2066-02"), ]
	# Cleaning workspace and removing "other" level from action variable
	rm(countrycodes, j, i, filename, countryname)
	data$action <- factor(data$action)


	# Turning date variable into date class for plotting: each "YYYY-MM" label is
	# prefixed with day "01-" and parsed, giving the first day of that month.
	data$date <- as.Date(paste0("01-", as.character(data$date)), format = "%d-%Y-%m")

	# Plot protest data through time, separating strikes from demonstrations.
	# Bar heights are the precomputed `freq` values; each action type gets its own
	# facet (stacked in one column) with an independent y axis.
	protest.plot <- ggplot(data, aes(x = date, y = freq)) +
		# freq is already a count, so the identity stat is used; a bin width has no
		# meaning here, hence geom_bar() rather than geom_histogram(binwidth = 1).
		geom_bar(stat = "identity") +
		scale_x_date(breaks = "3 months",
			minor_breaks = "1 month",
			# date_format() is exported by the scales package, which this script
			# never attaches; qualifying the call lets it resolve with only ggplot2
			# loaded.
			labels = scales::date_format("%b %Y"),
			# The x axis is restricted to 1980-07-01 .. 1995-04-01.
			limits = c(as.Date("1980-07-01"), as.Date("1995-04-01"))) +
		theme(axis.text.x = element_text(angle = 90),
			plot.title = element_text(face="bold")) +
		ylab("") + xlab("") +
		ggtitle("MONTHLY NUMBER OF DEMONSTRATIONS AND STRIKES IN EUROPE, 1980-1995\n
	(Albania, Austria, Belgium, Bulgaria, Cyprus, Denmark, Finland, France, FR Germany, Greece, Iceland, Ireland,
	Italy, Luxembourg, Netherlands, N. Ireland, Norway, Poland, Portugal, Romania, Spain, Sweden, Switzerland, UK)\n") +
		facet_wrap( ~ action, ncol = 1, scales = "free_y")

	# Plot dependent variable (monthly freq of demonstrations/strikes):
	# histogram, with unit-wide bins, of the `freq` column over the rows of `data`
	# whose freq is below 101.
	DV.plot <- ggplot(data = data[data$freq < 101, ], mapping = aes(x = freq)) +
		geom_histogram(binwidth = 1) +
		labs(x = "Monthly frequency of demonstrations and strikes in each country", y = "") +
		ggtitle("Dependent variable - freq distribution\n(Removed 21 cases above 100 from plot)")