Skip to content

Instantly share code, notes, and snippets.

@michelleboisson
Created December 5, 2012 02:54
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save michelleboisson/4211707 to your computer and use it in GitHub Desktop.
Save michelleboisson/4211707 to your computer and use it in GitHub Desktop.
Data Without Borders Final Exploration
# Final Data Exploration — Grameen Foundation, CKWs (Community Knowledge Workers)
# Load the two raw CSV extracts. as.is = TRUE keeps text columns as character
# vectors instead of factors.
search.log.path <- "/Users/michelleboisson/Documents/ITP/* Data without Borders/final/search_logs.csv"
ckw.path <- "/Users/michelleboisson/Documents/ITP/* Data without Borders/final/ckw.csv"

ix.data <- read.csv(search.log.path, header = TRUE, as.is = TRUE)
head(ix.data)
dim(ix.data)
#[1] 334718 11

ckws <- read.csv(ckw.path, header = TRUE, as.is = TRUE)
head(ckws)
dim(ckws)
#[1] 896 20

# Column inventory for the search log (recorded output):
names(ix.data)
#[1] "Interviewer..Person.ID" "Interviewer..Gender" "Interviewer..District.Name"
#[4] "Interviewee" "Handset.Submit.Time" "Server.Entry.Time"
#[7] "Submission.Latitude" "Submission.Longitude" "Category"
#[10] "Query" "Response"

# Column inventory for the CKW roster (recorded output):
names(ckws)
#[1] "Person.ID"
#[2] "CKW.ID"
#[3] "Gender"
#[4] "District"
#[5] "Education.Level"
#[6] "Active.Date"
#[7] "CKW.Status"
#[8] "CKW.Type"
#[9] "Poverty.Status"
#[10] "Previous.Month.s.Performance..Start.Date"
#[11] "Previous.Month.s.Performance..Searches..Target"
#[12] "Previous.Month.s.Performance..Searches..Total.Valid"
#[13] "Previous.Month.s.Performance..Searches..Invalid"
#[14] "Previous.Month.s.Performance..Searches..New.System..Updated.Daily."
#[15] "Previous.Month.s.Performance..Surveys..Target"
#[16] "Previous.Month.s.Performance..Surveys..Total.Valid"
#[17] "Previous.Month.s.Performance..Surveys..Approved"
#[18] "Previous.Month.s.Performance..Surveys..Rejected"
#[19] "Previous.Month.s.Performance..Surveys..Pending"
#[20] "Previous.Month.s.Performance..Surveys..Not.Reviewed"
# Parse the raw timestamp strings into date-time objects so they can be
# compared and plotted numerically.
# IMPROVEMENT: strptime() returns POSIXlt (a list-based type that misbehaves
# as a data-frame column); convert to POSIXct before storing. Downstream
# as.POSIXlt()/min()/max()/comparison calls all still work on POSIXct.
ix.data$Handset.Submit.Seconds <- as.POSIXct(
  strptime(ix.data$Handset.Submit.Time, format = "%m/%d/%y %H:%M"))
ix.data$Server.Entry.Seconds <- as.POSIXct(
  strptime(ix.data$Server.Entry.Time, format = "%m/%d/%y %H:%M"))
# The data frame now has two extra parsed-time columns to plot over time.
names(ix.data)
#[1] "Interviewer..Person.ID" "Interviewer..Gender" "Interviewer..District.Name"
#[4] "Interviewee" "Handset.Submit.Time" "Server.Entry.Time"
#[7] "Submission.Latitude" "Submission.Longitude" "Category"
#[10] "Query" "Response" "Handset.Submit.Seconds"
#[13] "Server.Entry.Seconds"
# Server-side query volume over time: steady growth, plus one large abnormal
# spike (investigated further below).
# BUG FIX: plot title typo "Sever" -> "Server".
hist(as.numeric(as.POSIXlt(ix.data$Server.Entry.Seconds)), breaks=500, main="Server Queries Over Time")
# Handset submissions, by contrast, only start appearing much later —
# suspicious; NA/garbage handset timestamps are cleaned up below.
hist(as.numeric(as.POSIXlt(ix.data$Handset.Submit.Seconds)), breaks=500, main="Handset Submissions Over Time")
# Drop rows where either timestamp failed to parse (strptime yields NA for
# unparseable strings).
# IMPROVEMENT: direct logical indexing replaces the two-step
# which(is.na(x) == FALSE) pattern — same rows kept, one pass, idiomatic.
ix.data.cleaned <- ix.data[!is.na(ix.data$Handset.Submit.Seconds) &
                           !is.na(ix.data$Server.Entry.Seconds), ]
#When I plot this:
hist(as.numeric(as.POSIXlt(ix.data.cleaned$Handset.Submit.Seconds)), breaks=500, main="Handset Submissions Over Time")
#I'm still seeing some irregularity — check the extremes of both time columns
min(ix.data.cleaned$Handset.Submit.Seconds)
#[1] "1980-01-05 17:05:00 EST"
min(ix.data.cleaned$Server.Entry.Seconds)
#[1] "2010-10-01 21:52:00 EDT"
#count handset timestamps that predate the earliest server timestamp
length(which(as.numeric(ix.data.cleaned$Handset.Submit.Seconds) < as.numeric(min(ix.data.cleaned$Server.Entry.Seconds))))
#[1] 1626
#1626 entries where the handset time is earlier than the earliest server entry
#and that difference is of 30 years... likely handsets with unset clocks
#(presumably defaulting to a 1980 epoch — TODO confirm with data provider)
max(ix.data.cleaned$Server.Entry.Seconds)
#[1] "2011-11-03 21:20:00 EDT"
max(ix.data.cleaned$Handset.Submit.Seconds)
#[1] "2011-11-15 18:36:00 EST"
#so the latest dates fall a little closer together
#for simplicity, use the date range from the Server Entry times and drop
#the 1626 entries that fall outside that range (strict ">" also drops any
#row exactly equal to the earliest server time)
ix.data.cleaned = ix.data.cleaned[which(as.numeric(ix.data.cleaned$Handset.Submit.Seconds) > as.numeric(min(ix.data.cleaned$Server.Entry.Seconds))), ]
dim(ix.data.cleaned)
#[1] 332548 13
#plot again with the out-of-range rows removed
hist(as.numeric(as.POSIXlt(ix.data.cleaned$Handset.Submit.Seconds)), breaks=500, main="Handset Submissions Over Time")
#that looks much better
# Mark the window when the bicycles were introduced (Sept 2-3, 2011) on the
# histogram currently active as the plot device.
bike.start = "2011-09-02 00:00:00 EST"
bike.end = "2011-09-03 00:00:00 EST"
# BUG FIX: col="red" was inside the as.numeric() call — as.numeric silently
# ignored it and abline() never received it, so the lines drew in the
# default color. The parenthesis now closes before col=.
abline(v = as.numeric(as.POSIXlt(bike.start)), col = "red")
abline(v = as.numeric(as.POSIXlt(bike.end)), col = "red")
# Map the submission coordinates on a world map.
library(maps)
# BUG FIX: longitude belongs on the x axis (xlim) and latitude on the y axis
# (ylim) — the original had them swapped, and points() plotted longitude
# against itself. na.rm=TRUE guards range() against missing coordinates.
map("world",
    xlim = range(ix.data$Submission.Longitude, na.rm = TRUE),
    ylim = range(ix.data$Submission.Latitude, na.rm = TRUE))
points(ix.data$Submission.Longitude, ix.data$Submission.Latitude,
       col = 2, pch = 19, cex = 0.8)
#see map
# Distribution of searches across districts ----
districts <- table(ix.data.cleaned$Interviewer..District.Name)
plot(districts)
# Search categories, most frequent first (recorded output):
rev(sort(table(ix.data.cleaned$Category)))
#Crops Market Information Animals Content Not Found
#81564 70812 60201 46304
#Regional Weather Info Farm Inputs MobileMoney Directory
#36739 18405 8673 5530
#Service Providers Cultures Vivrières
#4092 228
# How many distinct interviewers (CKWs) appear in the cleaned log?
length(unique(ix.data.cleaned$Interviewer..Person.ID))
#[1] 872
# Focus on the districts where the bikes were introduced:
# Gulu, Amuru, Nwoya.
unique(ix.data.cleaned$Interviewer..District.Name)
#[1] "Kitgum" "Pader" "Masindi" "Mbale" "Amuru" "Kapchorwa"
#[7] "Kasese" "Bushenyi" "Gulu" "Oyam" "Nwoya" "Mukono"
#[13] "Luwero Nakaseke" "Agago" ""
# IMPROVEMENT: %in% against a vector of districts replaces the chain of
# == comparisons joined with | — same rows selected, easier to extend.
bike.districts <- c("Gulu", "Nwoya", "Amuru")
bike.areas.data <- ix.data.cleaned[
  ix.data.cleaned$Interviewer..District.Name %in% bike.districts, ]
dim(bike.areas.data)
#[1] 48988 13
unique(bike.areas.data$Interviewer..District.Name)
#[1] "Amuru" "Gulu" "Nwoya"
# Server-side activity in those districts, with the bike dates marked.
hist(as.numeric(as.POSIXlt(bike.areas.data$Server.Entry.Seconds)), breaks=500, main="Server Submissions in Areas where Bikes were Distributed")
# BUG FIX: as in the earlier plot, col="red" was trapped inside as.numeric()
# and never reached abline().
abline(v = as.numeric(as.POSIXlt(bike.start)), col = "red")
abline(v = as.numeric(as.POSIXlt(bike.end)), col = "red")
# Investigate the recurring spike visible in Server.Entry.Seconds: pull the
# rows sharing the latest server timestamp.
max(ix.data.cleaned$Server.Entry.Seconds)
#[1] "2011-11-03 21:20:00 EDT"
ix.data.cleaned[which(ix.data.cleaned$Server.Entry.Seconds == max(ix.data.cleaned$Server.Entry.Seconds)), ]
# All of those rows come from the same interviewer in the same place —
# likely a batch upload rather than organic activity.
hist(as.numeric(as.POSIXlt(bike.areas.data$Handset.Submit.Seconds)), breaks=500, main="Handset Submissions in Areas where Bikes were Distributed")
# BUG FIX: col="red" moved out of the as.numeric() call so abline()
# actually draws red lines.
abline(v = as.numeric(as.POSIXlt(bike.start)), col = "red")
abline(v = as.numeric(as.POSIXlt(bike.end)), col = "red")
#The Plan:
#Look at average visits per day per ckw before
#and after bicycles were introduced, using Handset.Submit.Time.
# Encode each interaction's calendar day as one integer:
#   years-since-1900 * 365 + day-of-year
# (ignores leap years, but it is strictly increasing over this date range,
# which is all the before/after comparison needs).
# IMPROVEMENT: the original computed this with a row-by-row for loop over
# ~49k rows ("this function took forever!"). POSIXlt's $year and $yday are
# vectorized, so the whole column is computed in one shot.
submit.lt <- as.POSIXlt(bike.areas.data$Handset.Submit.Seconds)
bike.areas.data$day <- submit.lt$year * 365 + submit.lt$yday
max(bike.areas.data$day) - min(bike.areas.data$day)
#[1] 405
#we have data over 405 days
# Subset to just the columns needed: who searched, on which day.
sub.ckws.days <- bike.areas.data[, c("Interviewer..Person.ID", "day")]
names(sub.ckws.days) <- c("person", "day") # simplifying names
# Parse each CKW's activation date into a proper timestamp.
# IMPROVEMENT: store as POSIXct (strptime's POSIXlt misbehaves as a
# data-frame column).
ckws$Active.Date.Proper <- as.POSIXct(strptime(ckws$Active.Date, format = "%m/%d/%y"))
#for simplicity I will use Sept 2nd 2011 as the day for bikes
#(instead of both the 2nd and 3rd)
# Join search events to CKW metadata.
# BUG FIXES: (1) the original call ended with an unbalanced extra ")" —
# a syntax error; (2) by.x referenced "Interviewer..Person.ID", a column
# renamed away two statements earlier — the key in sub.ckws.days is "person".
exchanges.ckws <- merge(sub.ckws.days, ckws, by.x = "person", by.y = "Person.ID")
# Keep rows with a usable activation date.
# NOTE(review): this compares a timestamp (seconds) to the "unique day"
# integer, so it effectively just filters out NA activation dates — which is
# what the author observed below. TODO confirm that is the intent.
old.Gs = exchanges.ckws[which(exchanges.ckws$Active.Date.Proper >= min(exchanges.ckws$day)), c("person", "day", "District", "Active.Date.Proper")]
dim(old.Gs)
#[1] 48896 4
#turns out the ones we got rid of are NAs, not ones that started after this
#dataset was created:
length(which(is.na(exchanges.ckws$Active.Date.Proper)))
#[1] 213
#renaming for readability
biking.ckws = old.Gs
length(unique(biking.ckws$person))
#[1] 101
# Drill into one CKW to see what per-person daily activity looks like.
one.ckw <- biking.ckws[which(biking.ckws$person == "Person-000983"), ]
one.ckw
table(one.ckw$day)
plot(table(one.ckw$day))
# Express the bike-introduction date in the same "unique day" encoding
# (years-since-1900 * 365 + day-of-year) used for the day column.
start.lt <- as.POSIXlt(bike.start)
bike.start.uniqueday <- start.lt$year * 365 + start.lt$yday
bike.start.uniqueday
#[1] 40759
abline(v = bike.start.uniqueday, col = "red")
# Now all CKWs in the bike districts combined.
plot(table(biking.ckws$day), main = "CKWs' Activity in Bike Areas")
abline(v = bike.start.uniqueday, col = "red")
# Alternatively, look at daily counts as a histogram and then as a time
# series, searching for periodicity.
h = hist(biking.ckws$day, breaks=500)
a <- acf(biking.ckws$day, lag.max=30) #since we're dealing with days, up to a month of lags
#the ACF suggests a cycle roughly every 6 days
days <- ts(h$counts, frequency=6)
# BUG FIX: the original called plot(parts) here before `parts` existed —
# the frequency-6 series was never decomposed. Decompose first, then plot.
parts <- decompose(days)
plot(parts)
abline(v=bike.start.uniqueday, col="red") # NOTE: abline lands on the last
# panel of the decomposition plot, not the raw series — this didn't work well
#there still seems to be a cycle in the trend, so try a 30-day frequency
days <- ts(h$counts, frequency=30)
parts <- decompose(days)
plot(parts)
abline(v=bike.start.uniqueday, col="red")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment