### Miles Grimshaw Code For Blog Post on Personal Data
### May 5th 2013
getwd()
setwd("~/Desktop/x_Final_Project")
library(stringr)
library(lubridate)
library(ggplot2)
############## TWITTER ##################
t <- read.csv("./Data/tweets.csv", header=TRUE, as.is=TRUE)
names(t)
t$timestamp[1:10]
### Trim the timestamp down to just the date and time
t$timestamp_clean <- str_sub(t$timestamp, 1, -7)
t$timestamp_clean[1:10]
### Make understandable as a date by R
## with_tz() keeps a POSIXct object (format() would return character,
## which breaks wday()/hour()/am() below)
t$tweets_datetime <- with_tz(ymd_hms(t$timestamp_clean, quiet = TRUE), tzone = "America/New_York")
t$tweets_datetime[1:10]
t$date <- substring(t$timestamp, 1, 10)
t$date[1:10]
t$date <- as.POSIXct(t$date,format='%Y-%m-%d', tz="America/New_York")
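## Quick sanity check (optional sketch): count rows that failed to parse and
## confirm the date range looks sensible before plotting
sum(is.na(t$tweets_datetime))
range(t$date, na.rm=TRUE)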
### Plot of tweets over time
pdf(file="Tweets_Over_Time.pdf",width=11,height=8.5)
ggplot(t, aes(x=date)) + geom_histogram(binwidth = 60*60*24*7, aes(fill = ..count..)) +
  scale_fill_gradient("Count", low = "skyblue", high = "blue") +
  xlab("Date") + ggtitle("Miles' Twitter output by week")
dev.off()
### Create day of the week and hour variables
t$day_of_week <- wday(t$tweets_datetime, label = TRUE, abbr = FALSE)
t$day_of_week[1:10]
t$hour <- hour(t$tweets_datetime)
t$hour[1:10]
t$am_pm <- ifelse(am(t$tweets_datetime), "AM", "PM")
t$am_pm[1:10]
### AM VS PM BY DAY OF WEEK
pdf(file="Tweets_AMPM_Day.pdf",width=11,height=8.5)
counts <- table(t$am_pm, t$day_of_week)
barplot(counts, beside=TRUE, xlab="Day of the Week", ylab="Count",
        main="Tweets by Weekday Separated by AM vs PM", cex.names=0.8,
        col=c('lightblue', 'navy'))
legend(x=0.5, y=300, rownames(counts), fill=c('lightblue','navy'), bty='n',
       adj=0.4, cex=1.2)
dev.off()
### Distribution by Day
which(is.na(t$day_of_week))
t <- t[which(!is.na(t$day_of_week)),]
pdf(file="Tweets_Hours_Day.pdf",width=11,height=8.5)
ggplot(t, aes(hour)) + geom_bar() + facet_wrap(~ day_of_week) +
  ggtitle("Miles' Tweets by Hour and Day of The Week") + xlab("Hour") + ylab("Count")
dev.off()
daytime <- table(t$hour, t$day_of_week)
dfdaytime <- as.data.frame(daytime)
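## Quick peek (sketch): as.data.frame(table(...)) yields Var1 = hour,
## Var2 = weekday, Freq = count, which the heatmap below maps directly
head(dfdaytime)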
pdf(file="Tweets_Heatmap.pdf",width=11,height=8.5)
ggplot(dfdaytime, aes(x=Var2, y=Var1, fill=Freq)) + geom_tile() +
  scale_fill_gradient(low="white", high="red") +
  ggtitle("Heatmap of Miles' Tweets") + xlab("Day of week") + ylab("Hour of the Day")
dev.off()
### WORD CLOUDS
library(tm)
library(wordcloud)
# Clean up tweets
text <- tolower(t$text)
text <- gsub("(RT|via)((?:\\b\\W*@\\w+)+)", "", text)
## Might not want to get rid of people for wordcloud
text <- gsub("@\\w+", "", text)
# remove html links
text <- gsub("http\\w+", "", text)
text <- removeWords(text, stopwords(kind="en"))
text <- removeWords(text, c("rt", "milesgrimshaw", "http", "twitter"))
text <- removePunctuation(text)
text <- removeNumbers(text)
## OR
# text <- gsub("[[:punct:]]", "", text)
# text <- gsub("[[:digit:]]", "", text)
text <- stripWhitespace(text)
# text <- gsub("[ \t]{2,}", "", text)
# text <- gsub("^\\s+|\\s+$", "", text)
# To use w/ TM package have to create a corpus
# and from that document term matrix (DTM)
text_corp <- Corpus(VectorSource(text))
text_dtm <- DocumentTermMatrix(text_corp)
inspect(text_dtm[1:2,50:60])
findFreqTerms(text_dtm, 20)
word_freq <- colSums(as.matrix(text_dtm))
word_freq[1:20]
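## Optional sketch: sort the frequencies to eyeball the top terms before
## committing to a wordcloud
word_freq_sorted <- sort(word_freq, decreasing=TRUE)
word_freq_sorted[1:20]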
# Create wordcloud of top 100 most frequent words
pdf(file="Tweets_Wordcloud.pdf",width=11,height=8.5)
wordcloud(names(word_freq), word_freq, scale=c(3.2,.2),
max.words = 100, random.order=FALSE,
colors=c("gray", "darkgreen", "darkblue"), rot.per=.3)
dev.off()
############## FACEBOOK ##################
### FUNCTIONS
get_day <- function (x) {
  if (grepl('Monday', x)) return (1)
  else if (grepl('Tuesday', x)) return (2)
  else if (grepl('Wednesday', x)) return (3)
  else if (grepl('Thursday', x)) return (4)
  else if (grepl('Friday', x)) return (5)
  else if (grepl('Saturday', x)) return (6)
  else if (grepl('Sunday', x)) return (7)
  else return (NA)
}
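## A vectorized alternative sketch using stringr (already loaded above):
## str_extract() pulls out the first weekday name and match() maps it to 1-7.
## Untested against this particular html export, so treat it as an assumption.
day_names <- c('Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday')
get_day_vec <- function (x) {
  match(str_extract(x, paste(day_names, collapse="|")), day_names)
}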
get_time <- function (x) {
  n <- nchar(x)
  time <- substring(x, (n-11), n)
  time <- gsub("</td>", "", time)
  return (time)
}
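## Regex-based alternative sketch: grab the first "hh:mmam/pm" token directly
## instead of slicing a fixed-width substring (assumes that pattern holds for
## every row of the export)
get_time_re <- function (x) {
  str_extract(x, "\\d{1,2}:\\d{2}\\s?[APap][Mm]")
}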
## Read in the html
f <- scan("./Data/Facebook/html/account_activity.html", what="", sep="\n")
f2 <- unlist(strsplit(f, '<td>', fixed = TRUE))
f2 <- f2[-1]
head(f2)
# The dates are every 5th column
f2[c(1,6,11,16,21)]
l <- length(f2)
data <- f2[seq(1, length(f2), 5)]
which(is.na(data))
l <- length(data)
data[1:30]
## Manually create factors for 'today' and 'yesterday'
day1 <- rep(0,13)
## 1 = Monday
## 7 = Sunday
## The first 9 days say 'Today' which is Thursday
day1[1:9] <- 4
day1[10:13] <- 3
data_subset <- data[14:l]
data_subset[1:10]
l_sub <- length(data_subset)
day2 <- rep(NA,l_sub)
which(is.na(data_subset))
## Get the dates
day2 <- lapply(data_subset, get_day)
day2 <- unlist(day2)
## Bind the manually created dates and stripped dates together
day <- c(day1,day2)
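## Sanity check (sketch): the combined day vector should line up one-to-one
## with the original data
length(day) == length(data)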
count <- table(day)
barplot(count)
## Get the time
# l is length of original data
time <- rep(NA,l)
time <- lapply(data, get_time)
time <- unlist(time)
## Convert AM/PM to 24hr understandable by R
time <- strptime(time, "%I:%M%p", tz="America/New_York")
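## Spot-check the conversion (sketch): strptime attaches today's date, but
## only the hour component is used in the plots below
head(format(time, "%H:%M"))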
## Create a data frame with the days and times aligned
ff <- data.frame(day, time)
head(ff)
## Create factors and rename the factor levels for the day of the week
ff$day2 <- factor(ff$day)
levels(ff$day2) <- c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday")
## Strip the time down to the hour
ff$hour <- hour(ff$time)
ff$hour[1:10]
### Distribution by Day
pdf(file="Facebook_Day_Time.pdf",width=11,height=8.5)
ggplot(ff, aes(hour)) + geom_bar() + facet_wrap(~ day2) +
ggtitle("Miles' Facebook Usage by Hour and Day of The Week") +
xlab("Hour of the Day") + ylab("Count")
dev.off()
## Create a new data frame so can make the heatmap
daytime <- table(ff$hour, ff$day2)
dfdaytime <- as.data.frame(daytime)
pdf(file="Facebook_Heatmap.pdf",width=11,height=8.5)
ggplot(dfdaytime, aes(x=Var2, y=Var1, fill=Freq)) + geom_tile() +
scale_fill_gradient(low="white", high="red") +
ggtitle("Heatmap of Miles' Facebook Usage") + xlab("Day of week") + ylab ("Hour of day")
dev.off()
############## OPEN PATHS ##################
library(maps)
library(geosphere)
## Read in the data
l <- read.csv("./Data/openpaths.csv", header=TRUE, as.is=TRUE)
head(l)
## Only select cases where I actually moved
l2 <- l[which(l$alt!=0),]
head(l2)
which(l2$alt>100)
test <- l2[which(l2$alt>100),]
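## Sanity check (sketch): how many points survive the altitude filter
nrow(test)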
names(test)
which(is.na(test$lon))
## Get the states I want to map
states <- c('new york', 'connecticut', 'massachusetts', 'new hampshire', 'vermont')
## Map just the specific states
pdf(file="Paths_States.pdf",width=11,height=8.5)
map('county',states, col="#f2f2f2", fill=TRUE, bg="white", lwd=0.01)
## Map the country
#pdf(file="Paths_USA.pdf",width=11,height=8.5)
#map('state',col="#f2f2f2", fill=TRUE, bg="white", lwd=0.01)
## Loop over consecutive points in test (not l) and draw great-circle segments
for (i in 1:(nrow(test)-1)) {
  lat_1 <- test$lat[i]
  lon_1 <- test$lon[i]
  lat_2 <- test$lat[i+1]
  lon_2 <- test$lon[i+1]
  if (lat_1 != lat_2 && lon_1 != lon_2) {
    inter <- gcIntermediate(c(lon_1, lat_1), c(lon_2, lat_2), n=10, addStartEnd=TRUE)
    lines(inter, col="red", lwd=0.6)
  }
}
#title("Miles' Movements March 2012 - May 2013")
title("Miles' North East Movements March 2012 - May 2013")
dev.off()
#### SOURCES & HAT TIPS ###
# http://www.scienceisdelicious.net/?p=407