michelleboisson/Grameen-Foundation.R

## Grameen-Foundation.R
#Final Data Exploration
#Grameen Foundation, CKWs

ix.data = read.csv("/Users/michelleboisson/Documents/ITP/* Data without Borders/final/search_logs.csv", header=TRUE, as.is=TRUE)
head(ix.data)
dim(ix.data)
#[1] 334718     11

ckws = read.csv("/Users/michelleboisson/Documents/ITP/* Data without Borders/final/ckw.csv", header=TRUE, as.is=TRUE)
head(ckws)
dim(ckws)
#[1] 896  20

names(ix.data)
#[1] "Interviewer..Person.ID"     "Interviewer..Gender"        "Interviewer..District.Name"
#[4] "Interviewee"                "Handset.Submit.Time"        "Server.Entry.Time"
#[7] "Submission.Latitude"        "Submission.Longitude"       "Category"
#[10] "Query"                      "Response"

names(ckws)
#[1] "Person.ID"
#[2] "CKW.ID"
#[3] "Gender"
#[4] "District"
#[5] "Education.Level"
#[6] "Active.Date"
#[7] "CKW.Status"
#[8] "CKW.Type"
#[9] "Poverty.Status"
#[10] "Previous.Month.s.Performance..Start.Date"
#[11] "Previous.Month.s.Performance..Searches..Target"
#[12] "Previous.Month.s.Performance..Searches..Total.Valid"
#[13] "Previous.Month.s.Performance..Searches..Invalid"
#[14] "Previous.Month.s.Performance..Searches..New.System..Updated.Daily."
#[15] "Previous.Month.s.Performance..Surveys..Target"
#[16] "Previous.Month.s.Performance..Surveys..Total.Valid"
#[17] "Previous.Month.s.Performance..Surveys..Approved"
#[18] "Previous.Month.s.Performance..Surveys..Rejected"
#[19] "Previous.Month.s.Performance..Surveys..Pending"
#[20] "Previous.Month.s.Performance..Surveys..Not.Reviewed"


#convert times to seconds
submit.time <- strptime(ix.data$Handset.Submit.Time, format="%m/%d/%y %H:%M")
ix.data$Handset.Submit.Seconds = submit.time
server.time <- strptime(ix.data$Server.Entry.Time, format="%m/%d/%y %H:%M")
ix.data$Server.Entry.Seconds = server.time

#now I have extra columns that I can plot over time
names(ix.data)
#[1] "Interviewer..Person.ID"     "Interviewer..Gender"        "Interviewer..District.Name"
#[4] "Interviewee"                "Handset.Submit.Time"        "Server.Entry.Time"
#[7] "Submission.Latitude"        "Submission.Longitude"       "Category"
#[10] "Query"                      "Response"                   "Handset.Submit.Seconds"
#[13] "Server.Entry.Seconds"

#Server Queries over time. There seems to be a steady growth in the number of queries reaching the server.
#I do see a really, large abnormal spike
hist(as.numeric(as.POSIXlt(ix.data$Server.Entry.Seconds)), breaks=500, main="Sever Queries Over Time")

#however the handset submissions are only happening later
hist(as.numeric(as.POSIXlt(ix.data$Handset.Submit.Seconds)), breaks=500, main="Handset Submissions Over Time")
#let's remove the NAs
clean.times = which(is.na(ix.data$Handset.Submit.Seconds) == FALSE)
clean.times.handsets = ix.data[clean.times, ]
clean.times = which(is.na(clean.times.handsets$Server.Entry.Seconds) == FALSE)
ix.data.cleaned = clean.times.handsets[clean.times, ]

#When I plot this:
hist(as.numeric(as.POSIXlt(ix.data.cleaned$Handset.Submit.Seconds)), breaks=500, main="Handset Submissions Over Time")
#I'm still seeing some irregularity

min(ix.data.cleaned$Handset.Submit.Seconds)
#[1] "1980-01-05 17:05:00 EST"
min(ix.data.cleaned$Server.Entry.Seconds)
#[1] "2010-10-01 21:52:00 EDT"

length(which(as.numeric(ix.data.cleaned$Handset.Submit.Seconds) < as.numeric(min(ix.data.cleaned$Server.Entry.Seconds))))
#[1] 1626
#1626 entries where the earliest handset entry is less than the earliest server entry
#and that difference is of 30 years... I'm not sure what's happening here

max(ix.data.cleaned$Server.Entry.Seconds)
#[1] "2011-11-03 21:20:00 EDT"
max(ix.data.cleaned$Handset.Submit.Seconds)
#[1] "2011-11-15 18:36:00 EST"

#so the latest dates fall a little closer
#for simplicity I'm going to use the date range from the Server Entry times
#and ignore the 1626 entries that fall out that
ix.data.cleaned = ix.data.cleaned[which(as.numeric(ix.data.cleaned$Handset.Submit.Seconds) > as.numeric(min(ix.data.cleaned$Server.Entry.Seconds))), ]
dim(ix.data.cleaned)
#[1] 332548     13

#plot again
hist(as.numeric(as.POSIXlt(ix.data.cleaned$Handset.Submit.Seconds)), breaks=500, main="Handset Submissions Over Time")
#that looks much better

#Let's see where the bikes were introduced
bike.start = "2011-09-02 00:00:00 EST"
bike.end = "2011-09-03 00:00:00 EST"
abline(v= as.numeric(as.POSIXlt(bike.start), col="red"))
abline(v= as.numeric(as.POSIXlt(bike.end), col="red"))

#continuing my exploration of the data, let's map these
library(maps)
map("world", xlim = range(ix.data$Submission.Latitude), ylim = range(ix.data$Submission.Longitude))
points(ix.data$Submission.Longitude, ix.data$Submission.Longitude, col=2, pch=19, cex=0.8)
#see map

#What about distribution of Districts
district.table = table(ix.data.cleaned$Interviewer..District.Name)
plot(district.table)

#Categories of Search
rev(sort(table(ix.data.cleaned$Category)))
#Crops    Market Information               Animals     Content Not Found
#81564                 70812                 60201                 46304
#Regional Weather Info           Farm Inputs MobileMoney Directory
#36739                 18405                  8673                  5530
#Service Providers    Cultures Vivrières
#4092                   228

#Number of interviewers
length(unique(ix.data.cleaned$Interviewer..Person.ID))
#[1] 872


#Let's focus on the areas where the bikes were introduced:
#Gulu, Amuru, Nwoya
unique(ix.data.cleaned$Interviewer..District.Name)
#[1] "Kitgum"          "Pader"           "Masindi"         "Mbale"           "Amuru"           "Kapchorwa"
#[7] "Kasese"          "Bushenyi"        "Gulu"            "Oyam"            "Nwoya"           "Mukono"
#[13] "Luwero Nakaseke" "Agago"           ""
bike.areas.data = ix.data.cleaned[ix.data.cleaned$Interviewer..District.Name == 'Gulu' | ix.data.cleaned$Interviewer..District.Name == 'Nwoya' | ix.data.cleaned$Interviewer..District.Name == 'Amuru', ]
dim(bike.areas.data)
#[1] 48988    13
unique(bike.areas.data$Interviewer..District.Name)
#[1] "Amuru" "Gulu"  "Nwoya"

#ok let's plot this
hist(as.numeric(as.POSIXlt(bike.areas.data$Server.Entry.Seconds)), breaks=500, main="Server Submissions in Areas where Bikes were Distributed")
abline(v= as.numeric(as.POSIXlt(bike.start), col="red"))
abline(v= as.numeric(as.POSIXlt(bike.end), col="red"))

#Ok, now I'm curious about this spike I keep seeing in the Server.Entry.Seconds
max(ix.data.cleaned$Server.Entry.Seconds)
#[1] "2011-11-03 21:20:00 EDT"
ix.data.cleaned[which(ix.data.cleaned$Server.Entry.Seconds == max(ix.data.cleaned$Server.Entry.Seconds)), ]
#So all of these are from the same interviewer, in the same place


hist(as.numeric(as.POSIXlt(bike.areas.data$Handset.Submit.Seconds)), breaks=500, main="Handset Submissions in Areas where Bikes were Distributed")
abline(v= as.numeric(as.POSIXlt(bike.start), col="red"))
abline(v= as.numeric(as.POSIXlt(bike.end), col="red"))

#The Plan:
#Look at average visits per day per ckw before
#and after bicycles were introduced

#I'm going to use Handset.Submit.Time
#to find a day, I will add a column to each interaction indicating
#the unique day since 1/1/1900 = years since 1900 * 365 + day of the year

#add the new day for each column
for (i in 1:nrow(bike.areas.data))  {
  thisDate = bike.areas.data[i, "Handset.Submit.Seconds"]
  thisUniqueDay = as.POSIXlt(thisDate)$year * 365 + as.POSIXlt(thisDate)$yday

  bike.areas.data[i, "day"] = thisUniqueDay
}
#this function took forever! There's probably a better way to do this... but it worked!
max(bike.areas.data$day) - min(bike.areas.data$day)
#[1] 405
#we have data over 405 days

#subsetting the data
sub.ckws.days = bike.areas.data[,c("Interviewer..Person.ID", "day")]
names(sub.ckws.days) = c("person", "day") #simplifying names

#how about we get only the people who started before this data was collected
#convert Active.Date to a proper timestamp I can work with
Active.Date.Proper = strptime(ckws$Active.Date, format="%m/%d/%y")
ckws$Active.Date.Proper = Active.Date.Proper
#for simplicity I will use the Sept 2nd 011 as the day for bikes
#(instead of both the 2nd and 3rd)

#merge the data sets
exchanges.ckws <- merge(sub.ckws.days, ckws, by.y = "Person.ID", by.x="Interviewer..Person.ID"))
old.Gs = exchanges.ckws[which(exchanges.ckws$Active.Date.Proper >= min(exchanges.ckws$day)), c("person", "day", "District", "Active.Date.Proper")]
dim(old.Gs)
#[1] 48896     4
#turns out the ones we got rid of are NAs, not that the started after this dataset was created.
length(which(is.na(exchanges.ckws$Active.Date.Proper)))
#[1] 213

#renaming
biking.ckws = old.Gs
length(unique(biking.ckws$person))
#[1] 101

#Let's try with one person
Person.000983.Data = biking.ckws[which(biking.ckws$person == "Person-000983"),]
Person.000983.Data
table(Person.000983.Data$day)
plot(table(Person.000983.Data$day))

#convert bike.start to my crazy unique day formula
bike.start.uniqueday = as.POSIXlt(bike.start)$year * 365 + as.POSIXlt(bike.start)$yday
bike.start.uniqueday
#[1] 40759

abline(v=bike.start.uniqueday, col="red")

#all of them
plot(table(biking.ckws$day), main="CKWs' Activity in Bike Areas")
abline(v=bike.start.uniqueday, col="red")

#or this maybe
h = hist(biking.ckws$day, breaks=500)

#Let's get into a Time Series
a <- acf(biking.ckws$day, lag.max=30) #since we're dealing with days, I tried a max of a month

#there's a strong chance of a frequency every 6 days
days <- ts(h$counts, frequency=6)
plot(parts)
abline(v=bike.start.uniqueday, col="red") #this didn't work
#there still seems to be a cycle in the trend
# I tried 30 days

days <- ts(h$counts, frequency=30)
parts <- decompose(days)
plot(parts)
abline(v=bike.start.uniqueday, col="red")
	#Final Data Exploration
	#Grameen Foundation, CKWs

	ix.data = read.csv("/Users/michelleboisson/Documents/ITP/* Data without Borders/final/search_logs.csv", header=TRUE, as.is=TRUE)
	head(ix.data)
	dim(ix.data)
	#[1] 334718 11

	ckws = read.csv("/Users/michelleboisson/Documents/ITP/* Data without Borders/final/ckw.csv", header=TRUE, as.is=TRUE)
	head(ckws)
	dim(ckws)
	#[1] 896 20

	names(ix.data)
	#[1] "Interviewer..Person.ID" "Interviewer..Gender" "Interviewer..District.Name"
	#[4] "Interviewee" "Handset.Submit.Time" "Server.Entry.Time"
	#[7] "Submission.Latitude" "Submission.Longitude" "Category"
	#[10] "Query" "Response"

	names(ckws)
	#[1] "Person.ID"
	#[2] "CKW.ID"
	#[3] "Gender"
	#[4] "District"
	#[5] "Education.Level"
	#[6] "Active.Date"
	#[7] "CKW.Status"
	#[8] "CKW.Type"
	#[9] "Poverty.Status"
	#[10] "Previous.Month.s.Performance..Start.Date"
	#[11] "Previous.Month.s.Performance..Searches..Target"
	#[12] "Previous.Month.s.Performance..Searches..Total.Valid"
	#[13] "Previous.Month.s.Performance..Searches..Invalid"
	#[14] "Previous.Month.s.Performance..Searches..New.System..Updated.Daily."
	#[15] "Previous.Month.s.Performance..Surveys..Target"
	#[16] "Previous.Month.s.Performance..Surveys..Total.Valid"
	#[17] "Previous.Month.s.Performance..Surveys..Approved"
	#[18] "Previous.Month.s.Performance..Surveys..Rejected"
	#[19] "Previous.Month.s.Performance..Surveys..Pending"
	#[20] "Previous.Month.s.Performance..Surveys..Not.Reviewed"


	#convert times to seconds
	submit.time <- strptime(ix.data$Handset.Submit.Time, format="%m/%d/%y %H:%M")
	ix.data$Handset.Submit.Seconds = submit.time
	server.time <- strptime(ix.data$Server.Entry.Time, format="%m/%d/%y %H:%M")
	ix.data$Server.Entry.Seconds = server.time

	#now I have extra columns that I can plot over time
	names(ix.data)
	#[1] "Interviewer..Person.ID" "Interviewer..Gender" "Interviewer..District.Name"
	#[4] "Interviewee" "Handset.Submit.Time" "Server.Entry.Time"
	#[7] "Submission.Latitude" "Submission.Longitude" "Category"
	#[10] "Query" "Response" "Handset.Submit.Seconds"
	#[13] "Server.Entry.Seconds"

	#Server Queries over time. There seems to be a steady growth in the number of queries reaching the server.
	#I do see a really, large abnormal spike
	hist(as.numeric(as.POSIXlt(ix.data$Server.Entry.Seconds)), breaks=500, main="Sever Queries Over Time")

	#however the handset submissions are only happening later
	hist(as.numeric(as.POSIXlt(ix.data$Handset.Submit.Seconds)), breaks=500, main="Handset Submissions Over Time")
	#let's remove the NAs
	clean.times = which(is.na(ix.data$Handset.Submit.Seconds) == FALSE)
	clean.times.handsets = ix.data[clean.times, ]
	clean.times = which(is.na(clean.times.handsets$Server.Entry.Seconds) == FALSE)
	ix.data.cleaned = clean.times.handsets[clean.times, ]

	#When I plot this:
	hist(as.numeric(as.POSIXlt(ix.data.cleaned$Handset.Submit.Seconds)), breaks=500, main="Handset Submissions Over Time")
	#I'm still seeing some irregularity

	min(ix.data.cleaned$Handset.Submit.Seconds)
	#[1] "1980-01-05 17:05:00 EST"
	min(ix.data.cleaned$Server.Entry.Seconds)
	#[1] "2010-10-01 21:52:00 EDT"

	length(which(as.numeric(ix.data.cleaned$Handset.Submit.Seconds) < as.numeric(min(ix.data.cleaned$Server.Entry.Seconds))))
	#[1] 1626
	#1626 entries where the earliest handset entry is less than the earliest server entry
	#and that difference is of 30 years... I'm not sure what's happening here

	max(ix.data.cleaned$Server.Entry.Seconds)
	#[1] "2011-11-03 21:20:00 EDT"
	max(ix.data.cleaned$Handset.Submit.Seconds)
	#[1] "2011-11-15 18:36:00 EST"

	#so the latest dates fall a little closer
	#for simplicity I'm going to use the date range from the Server Entry times
	#and ignore the 1626 entries that fall out that
	ix.data.cleaned = ix.data.cleaned[which(as.numeric(ix.data.cleaned$Handset.Submit.Seconds) > as.numeric(min(ix.data.cleaned$Server.Entry.Seconds))), ]
	dim(ix.data.cleaned)
	#[1] 332548 13

	#plot again
	hist(as.numeric(as.POSIXlt(ix.data.cleaned$Handset.Submit.Seconds)), breaks=500, main="Handset Submissions Over Time")
	#that looks much better

	#Let's see where the bikes were introduced
	bike.start = "2011-09-02 00:00:00 EST"
	bike.end = "2011-09-03 00:00:00 EST"
	abline(v= as.numeric(as.POSIXlt(bike.start), col="red"))
	abline(v= as.numeric(as.POSIXlt(bike.end), col="red"))

	#continuing my exploration of the data, let's map these
	library(maps)
	map("world", xlim = range(ix.data$Submission.Latitude), ylim = range(ix.data$Submission.Longitude))
	points(ix.data$Submission.Longitude, ix.data$Submission.Longitude, col=2, pch=19, cex=0.8)
	#see map

	#What about distribution of Districts
	district.table = table(ix.data.cleaned$Interviewer..District.Name)
	plot(district.table)

	#Categories of Search
	rev(sort(table(ix.data.cleaned$Category)))
	#Crops Market Information Animals Content Not Found
	#81564 70812 60201 46304
	#Regional Weather Info Farm Inputs MobileMoney Directory
	#36739 18405 8673 5530
	#Service Providers Cultures Vivrières
	#4092 228

	#Number of interviewers
	length(unique(ix.data.cleaned$Interviewer..Person.ID))
	#[1] 872


	#Let's focus on the areas where the bikes were introduced:
	#Gulu, Amuru, Nwoya
	unique(ix.data.cleaned$Interviewer..District.Name)
	#[1] "Kitgum" "Pader" "Masindi" "Mbale" "Amuru" "Kapchorwa"
	#[7] "Kasese" "Bushenyi" "Gulu" "Oyam" "Nwoya" "Mukono"
	#[13] "Luwero Nakaseke" "Agago" ""
	bike.areas.data = ix.data.cleaned[ix.data.cleaned$Interviewer..District.Name == 'Gulu' \| ix.data.cleaned$Interviewer..District.Name == 'Nwoya' \| ix.data.cleaned$Interviewer..District.Name == 'Amuru', ]
	dim(bike.areas.data)
	#[1] 48988 13
	unique(bike.areas.data$Interviewer..District.Name)
	#[1] "Amuru" "Gulu" "Nwoya"

	#ok let's plot this
	hist(as.numeric(as.POSIXlt(bike.areas.data$Server.Entry.Seconds)), breaks=500, main="Server Submissions in Areas where Bikes were Distributed")
	abline(v= as.numeric(as.POSIXlt(bike.start), col="red"))
	abline(v= as.numeric(as.POSIXlt(bike.end), col="red"))

	#Ok, now I'm curious about this spike I keep seeing in the Server.Entry.Seconds
	max(ix.data.cleaned$Server.Entry.Seconds)
	#[1] "2011-11-03 21:20:00 EDT"
	ix.data.cleaned[which(ix.data.cleaned$Server.Entry.Seconds == max(ix.data.cleaned$Server.Entry.Seconds)), ]
	#So all of these are from the same interviewer, in the same place


	hist(as.numeric(as.POSIXlt(bike.areas.data$Handset.Submit.Seconds)), breaks=500, main="Handset Submissions in Areas where Bikes were Distributed")
	abline(v= as.numeric(as.POSIXlt(bike.start), col="red"))
	abline(v= as.numeric(as.POSIXlt(bike.end), col="red"))

	#The Plan:
	#Look at average visits per day per ckw before
	#and after bicycles were introduced

	#I'm going to use Handset.Submit.Time
	#to find a day, I will add a column to each interaction indicating
	#the unique day since 1/1/1900 = years since 1900 * 365 + day of the year

	#add the new day for each column
	for (i in 1:nrow(bike.areas.data)) {
	thisDate = bike.areas.data[i, "Handset.Submit.Seconds"]
	thisUniqueDay = as.POSIXlt(thisDate)$year * 365 + as.POSIXlt(thisDate)$yday

	bike.areas.data[i, "day"] = thisUniqueDay
	}
	#this function took forever! There's probably a better way to do this... but it worked!
	max(bike.areas.data$day) - min(bike.areas.data$day)
	#[1] 405
	#we have data over 405 days

	#subsetting the data
	sub.ckws.days = bike.areas.data[,c("Interviewer..Person.ID", "day")]
	names(sub.ckws.days) = c("person", "day") #simplifying names

	#how about we get only the people who started before this data was collected
	#convert Active.Date to a proper timestamp I can work with
	Active.Date.Proper = strptime(ckws$Active.Date, format="%m/%d/%y")
	ckws$Active.Date.Proper = Active.Date.Proper
	#for simplicity I will use the Sept 2nd 011 as the day for bikes
	#(instead of both the 2nd and 3rd)

	#merge the data sets
	exchanges.ckws <- merge(sub.ckws.days, ckws, by.y = "Person.ID", by.x="Interviewer..Person.ID"))
	old.Gs = exchanges.ckws[which(exchanges.ckws$Active.Date.Proper >= min(exchanges.ckws$day)), c("person", "day", "District", "Active.Date.Proper")]
	dim(old.Gs)
	#[1] 48896 4
	#turns out the ones we got rid of are NAs, not that the started after this dataset was created.
	length(which(is.na(exchanges.ckws$Active.Date.Proper)))
	#[1] 213

	#renaming
	biking.ckws = old.Gs
	length(unique(biking.ckws$person))
	#[1] 101

	#Let's try with one person
	Person.000983.Data = biking.ckws[which(biking.ckws$person == "Person-000983"),]
	Person.000983.Data
	table(Person.000983.Data$day)
	plot(table(Person.000983.Data$day))

	#convert bike.start to my crazy unique day formula
	bike.start.uniqueday = as.POSIXlt(bike.start)$year * 365 + as.POSIXlt(bike.start)$yday
	bike.start.uniqueday
	#[1] 40759

	abline(v=bike.start.uniqueday, col="red")

	#all of them
	plot(table(biking.ckws$day), main="CKWs' Activity in Bike Areas")
	abline(v=bike.start.uniqueday, col="red")

	#or this maybe
	h = hist(biking.ckws$day, breaks=500)

	#Let's get into a Time Series
	a <- acf(biking.ckws$day, lag.max=30) #since we're dealing with days, I tried a max of a month

	#there's a strong chance of a frequency every 6 days
	days <- ts(h$counts, frequency=6)
	plot(parts)
	abline(v=bike.start.uniqueday, col="red") #this didn't work
	#there still seems to be a cycle in the trend
	# I tried 30 days

	days <- ts(h$counts, frequency=30)
	parts <- decompose(days)
	plot(parts)
	abline(v=bike.start.uniqueday, col="red")