Created
December 5, 2012 02:54
-
-
Save michelleboisson/4211707 to your computer and use it in GitHub Desktop.
Data Without Borders Final Exploration
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#Final Data Exploration | |
#Grameen Foundation, CKWs | |
ix.data = read.csv("/Users/michelleboisson/Documents/ITP/* Data without Borders/final/search_logs.csv", header=TRUE, as.is=TRUE) | |
head(ix.data) | |
dim(ix.data) | |
#[1] 334718 11 | |
ckws = read.csv("/Users/michelleboisson/Documents/ITP/* Data without Borders/final/ckw.csv", header=TRUE, as.is=TRUE) | |
head(ckws) | |
dim(ckws) | |
#[1] 896 20 | |
names(ix.data) | |
#[1] "Interviewer..Person.ID" "Interviewer..Gender" "Interviewer..District.Name" | |
#[4] "Interviewee" "Handset.Submit.Time" "Server.Entry.Time" | |
#[7] "Submission.Latitude" "Submission.Longitude" "Category" | |
#[10] "Query" "Response" | |
names(ckws) | |
#[1] "Person.ID" | |
#[2] "CKW.ID" | |
#[3] "Gender" | |
#[4] "District" | |
#[5] "Education.Level" | |
#[6] "Active.Date" | |
#[7] "CKW.Status" | |
#[8] "CKW.Type" | |
#[9] "Poverty.Status" | |
#[10] "Previous.Month.s.Performance..Start.Date" | |
#[11] "Previous.Month.s.Performance..Searches..Target" | |
#[12] "Previous.Month.s.Performance..Searches..Total.Valid" | |
#[13] "Previous.Month.s.Performance..Searches..Invalid" | |
#[14] "Previous.Month.s.Performance..Searches..New.System..Updated.Daily." | |
#[15] "Previous.Month.s.Performance..Surveys..Target" | |
#[16] "Previous.Month.s.Performance..Surveys..Total.Valid" | |
#[17] "Previous.Month.s.Performance..Surveys..Approved" | |
#[18] "Previous.Month.s.Performance..Surveys..Rejected" | |
#[19] "Previous.Month.s.Performance..Surveys..Pending" | |
#[20] "Previous.Month.s.Performance..Surveys..Not.Reviewed" | |
#convert times to seconds | |
submit.time <- strptime(ix.data$Handset.Submit.Time, format="%m/%d/%y %H:%M") | |
ix.data$Handset.Submit.Seconds = submit.time | |
server.time <- strptime(ix.data$Server.Entry.Time, format="%m/%d/%y %H:%M") | |
ix.data$Server.Entry.Seconds = server.time | |
#now I have extra columns that I can plot over time | |
names(ix.data) | |
#[1] "Interviewer..Person.ID" "Interviewer..Gender" "Interviewer..District.Name" | |
#[4] "Interviewee" "Handset.Submit.Time" "Server.Entry.Time" | |
#[7] "Submission.Latitude" "Submission.Longitude" "Category" | |
#[10] "Query" "Response" "Handset.Submit.Seconds" | |
#[13] "Server.Entry.Seconds" | |
#Server Queries over time. There seems to be a steady growth in the number of queries reaching the server. | |
#I do see a really, large abnormal spike | |
hist(as.numeric(as.POSIXlt(ix.data$Server.Entry.Seconds)), breaks=500, main="Sever Queries Over Time") | |
#however the handset submissions are only happening later | |
hist(as.numeric(as.POSIXlt(ix.data$Handset.Submit.Seconds)), breaks=500, main="Handset Submissions Over Time") | |
#let's remove the NAs | |
clean.times = which(is.na(ix.data$Handset.Submit.Seconds) == FALSE) | |
clean.times.handsets = ix.data[clean.times, ] | |
clean.times = which(is.na(clean.times.handsets$Server.Entry.Seconds) == FALSE) | |
ix.data.cleaned = clean.times.handsets[clean.times, ] | |
#When I plot this: | |
hist(as.numeric(as.POSIXlt(ix.data.cleaned$Handset.Submit.Seconds)), breaks=500, main="Handset Submissions Over Time") | |
#I'm still seeing some irregularity | |
min(ix.data.cleaned$Handset.Submit.Seconds) | |
#[1] "1980-01-05 17:05:00 EST" | |
min(ix.data.cleaned$Server.Entry.Seconds) | |
#[1] "2010-10-01 21:52:00 EDT" | |
length(which(as.numeric(ix.data.cleaned$Handset.Submit.Seconds) < as.numeric(min(ix.data.cleaned$Server.Entry.Seconds)))) | |
#[1] 1626 | |
#1626 entries where the earliest handset entry is less than the earliest server entry | |
#and that difference is of 30 years... I'm not sure what's happening here | |
max(ix.data.cleaned$Server.Entry.Seconds) | |
#[1] "2011-11-03 21:20:00 EDT" | |
max(ix.data.cleaned$Handset.Submit.Seconds) | |
#[1] "2011-11-15 18:36:00 EST" | |
#so the latest dates fall a little closer | |
#for simplicity I'm going to use the date range from the Server Entry times | |
#and ignore the 1626 entries that fall out that | |
ix.data.cleaned = ix.data.cleaned[which(as.numeric(ix.data.cleaned$Handset.Submit.Seconds) > as.numeric(min(ix.data.cleaned$Server.Entry.Seconds))), ] | |
dim(ix.data.cleaned) | |
#[1] 332548 13 | |
#plot again | |
hist(as.numeric(as.POSIXlt(ix.data.cleaned$Handset.Submit.Seconds)), breaks=500, main="Handset Submissions Over Time") | |
#that looks much better | |
#Let's see where the bikes were introduced | |
bike.start = "2011-09-02 00:00:00 EST" | |
bike.end = "2011-09-03 00:00:00 EST" | |
abline(v= as.numeric(as.POSIXlt(bike.start), col="red")) | |
abline(v= as.numeric(as.POSIXlt(bike.end), col="red")) | |
#continuing my exploration of the data, let's map these | |
library(maps) | |
map("world", xlim = range(ix.data$Submission.Latitude), ylim = range(ix.data$Submission.Longitude)) | |
points(ix.data$Submission.Longitude, ix.data$Submission.Longitude, col=2, pch=19, cex=0.8) | |
#see map | |
#What about distribution of Districts | |
district.table = table(ix.data.cleaned$Interviewer..District.Name) | |
plot(district.table) | |
#Categories of Search | |
rev(sort(table(ix.data.cleaned$Category))) | |
#Crops Market Information Animals Content Not Found | |
#81564 70812 60201 46304 | |
#Regional Weather Info Farm Inputs MobileMoney Directory | |
#36739 18405 8673 5530 | |
#Service Providers Cultures Vivrières | |
#4092 228 | |
#Number of interviewers | |
length(unique(ix.data.cleaned$Interviewer..Person.ID)) | |
#[1] 872 | |
#Let's focus on the areas where the bikes were introduced: | |
#Gulu, Amuru, Nwoya | |
unique(ix.data.cleaned$Interviewer..District.Name) | |
#[1] "Kitgum" "Pader" "Masindi" "Mbale" "Amuru" "Kapchorwa" | |
#[7] "Kasese" "Bushenyi" "Gulu" "Oyam" "Nwoya" "Mukono" | |
#[13] "Luwero Nakaseke" "Agago" "" | |
bike.areas.data = ix.data.cleaned[ix.data.cleaned$Interviewer..District.Name == 'Gulu' | ix.data.cleaned$Interviewer..District.Name == 'Nwoya' | ix.data.cleaned$Interviewer..District.Name == 'Amuru', ] | |
dim(bike.areas.data) | |
#[1] 48988 13 | |
unique(bike.areas.data$Interviewer..District.Name) | |
#[1] "Amuru" "Gulu" "Nwoya" | |
#ok let's plot this | |
hist(as.numeric(as.POSIXlt(bike.areas.data$Server.Entry.Seconds)), breaks=500, main="Server Submissions in Areas where Bikes were Distributed") | |
abline(v= as.numeric(as.POSIXlt(bike.start), col="red")) | |
abline(v= as.numeric(as.POSIXlt(bike.end), col="red")) | |
#Ok, now I'm curious about this spike I keep seeing in the Server.Entry.Seconds | |
max(ix.data.cleaned$Server.Entry.Seconds) | |
#[1] "2011-11-03 21:20:00 EDT" | |
ix.data.cleaned[which(ix.data.cleaned$Server.Entry.Seconds == max(ix.data.cleaned$Server.Entry.Seconds)), ] | |
#So all of these are from the same interviewer, in the same place | |
hist(as.numeric(as.POSIXlt(bike.areas.data$Handset.Submit.Seconds)), breaks=500, main="Handset Submissions in Areas where Bikes were Distributed") | |
abline(v= as.numeric(as.POSIXlt(bike.start), col="red")) | |
abline(v= as.numeric(as.POSIXlt(bike.end), col="red")) | |
#The Plan: | |
#Look at average visits per day per ckw before | |
#and after bicycles were introduced | |
#I'm going to use Handset.Submit.Time | |
#to find a day, I will add a column to each interaction indicating | |
#the unique day since 1/1/1900 = years since 1900 * 365 + day of the year | |
#add the new day for each column | |
for (i in 1:nrow(bike.areas.data)) { | |
thisDate = bike.areas.data[i, "Handset.Submit.Seconds"] | |
thisUniqueDay = as.POSIXlt(thisDate)$year * 365 + as.POSIXlt(thisDate)$yday | |
bike.areas.data[i, "day"] = thisUniqueDay | |
} | |
#this function took forever! There's probably a better way to do this... but it worked! | |
max(bike.areas.data$day) - min(bike.areas.data$day) | |
#[1] 405 | |
#we have data over 405 days | |
#subsetting the data | |
sub.ckws.days = bike.areas.data[,c("Interviewer..Person.ID", "day")] | |
names(sub.ckws.days) = c("person", "day") #simplifying names | |
#how about we get only the people who started before this data was collected | |
#convert Active.Date to a proper timestamp I can work with | |
Active.Date.Proper = strptime(ckws$Active.Date, format="%m/%d/%y") | |
ckws$Active.Date.Proper = Active.Date.Proper | |
#for simplicity I will use the Sept 2nd 011 as the day for bikes | |
#(instead of both the 2nd and 3rd) | |
#merge the data sets | |
exchanges.ckws <- merge(sub.ckws.days, ckws, by.y = "Person.ID", by.x="Interviewer..Person.ID")) | |
old.Gs = exchanges.ckws[which(exchanges.ckws$Active.Date.Proper >= min(exchanges.ckws$day)), c("person", "day", "District", "Active.Date.Proper")] | |
dim(old.Gs) | |
#[1] 48896 4 | |
#turns out the ones we got rid of are NAs, not that the started after this dataset was created. | |
length(which(is.na(exchanges.ckws$Active.Date.Proper))) | |
#[1] 213 | |
#renaming | |
biking.ckws = old.Gs | |
length(unique(biking.ckws$person)) | |
#[1] 101 | |
#Let's try with one person | |
Person.000983.Data = biking.ckws[which(biking.ckws$person == "Person-000983"),] | |
Person.000983.Data | |
table(Person.000983.Data$day) | |
plot(table(Person.000983.Data$day)) | |
#convert bike.start to my crazy unique day formula | |
bike.start.uniqueday = as.POSIXlt(bike.start)$year * 365 + as.POSIXlt(bike.start)$yday | |
bike.start.uniqueday | |
#[1] 40759 | |
abline(v=bike.start.uniqueday, col="red") | |
#all of them | |
plot(table(biking.ckws$day), main="CKWs' Activity in Bike Areas") | |
abline(v=bike.start.uniqueday, col="red") | |
#or this maybe | |
h = hist(biking.ckws$day, breaks=500) | |
#Let's get into a Time Series | |
a <- acf(biking.ckws$day, lag.max=30) #since we're dealing with days, I tried a max of a month | |
#there's a strong chance of a frequency every 6 days | |
days <- ts(h$counts, frequency=6) | |
plot(parts) | |
abline(v=bike.start.uniqueday, col="red") #this didn't work | |
#there still seems to be a cycle in the trend | |
# I tried 30 days | |
days <- ts(h$counts, frequency=30) | |
parts <- decompose(days) | |
plot(parts) | |
abline(v=bike.start.uniqueday, col="red") | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment