milesgrimshaw/gist:8941929

## gistfile1.r
# Load packages
library(ggplot2)
library(lubridate)
library(stringr)
library(scales)

# Set the working directory
# getwd() only reports the directory the script was started from; its value is
# not stored. setwd() then moves into the folder whose contents are listed with
# list.files() and read file by file in the script section further down.
getwd()
setwd("~/Dropbox/Personal/Projects/Evernote_Analysis/files/")

#########   FUNCTIONS   ############

# Receives the lines of one export file as a character vector (one element
# per line, as produced by scan(..., what = "", sep = "\n")).
# Returns a data frame with one row per note and two columns:
#   times  - the raw text found between <created> and </created>
#   counts - the number of whitespace-separated words left in the note once
#            embedded <data> sections and markup tags have been removed
# Stops with an error when the <note>, </note> and <created> lines do not pair
# up one-to-one.
get_dates_counts <- function(file) {

  # Notes go from <note> to </note>
  note_begins <- grep("<note>", file, fixed = TRUE)
  note_ends <- grep("</note>", file, fixed = TRUE)
  if (length(note_begins) != length(note_ends)) {
    stop("Unbalanced <note> / </note> lines in export file", call. = FALSE)
  }

  # Lines recording when a note was created, and the timestamp inside each.
  # gsub() and grep() are vectorised, so an empty input yields character(0).
  created_pattern <- ".*<created>([0-9]+T[0-9]+Z)?</created>.*"
  times <- gsub(created_pattern, "\\1",
                grep(created_pattern, file, value = TRUE))
  if (length(times) != length(note_begins)) {
    stop("Found ", length(times), " <created> lines for ",
         length(note_begins), " notes", call. = FALSE)
  }

  ## Get the word count for each note

  # seq_along() visits every note that was found. The previous
  # 1:length(lines) measured the graphics function `lines` (length 1), so only
  # the first note was ever counted and its count was recycled for all rows.
  counts <- vapply(seq_along(note_begins), function(z) {
    note_lines <- file[note_begins[z]:note_ends[z]]

    # Drop embedded data sections: every line from a "<data " opening line
    # through the corresponding "</data>" line, both included. A logical mask
    # avoids backwards ranges such as 1:0 when a section sits on the first or
    # last line of the note.
    data_start <- grep("<data ", note_lines, fixed = TRUE)
    data_end <- grep("</data>", note_lines, fixed = TRUE)
    keep <- rep(TRUE, length(note_lines))
    for (y in seq_len(min(length(data_start), length(data_end)))) {
      keep[data_start[y]:data_end[y]] <- FALSE
    }
    text_note_lines <- note_lines[keep]

    # Have to get rid of markup tags so that only the text is counted
    text_note_lines <- gsub("<(.*?)>", "", text_note_lines)

    # Count the runs of non-whitespace characters on every line and add them
    # up. Base regex is used here; NOTE(review): it may classify unusual
    # Unicode whitespace differently from stringr - confirm if that matters.
    words <- regmatches(text_note_lines, gregexpr("\\S+", text_note_lines))
    sum(lengths(words))
  }, integer(1))

  # One row per note
  data.frame(times, counts)
}

#########   SCRIPT   ############

# Get all the files from the archive
files <- list.files()

# Get the total number of files; fail early with a clear message instead of
# letting 1:0 iterate over a non-existent file
num <- length(files)
if (num == 0) {
  stop("No export files found in ", getwd(), call. = FALSE)
}

# Read every file and stack the per-note timestamps and word counts into a
# single data frame. The pieces are collected in a list and bound once, which
# avoids re-copying the data frame on every iteration.
per_file <- lapply(files, function(path) {
  get_dates_counts(scan(path, what = "", sep = "\n"))
})
df <- do.call(rbind, per_file)

# Rename the data frame columns
names(df) <- c("time", "count")

# Create a formatted time stamp. as.POSIXct() is vectorised, so the whole
# column is parsed in one call; as.character() guards against the column
# having been stored as a factor.
df$time <- as.POSIXct(as.character(df$time),
                      format = "%Y%m%dT%H%M%SZ", tz = "GMT")

# A note whose timestamp cannot be parsed cannot be placed on the time axis
# and would make the cumulative note counts below NA, so drop it with a warning
unparsed <- is.na(df$time)
if (any(unparsed)) {
  warning(sum(unparsed),
          " note(s) without a parseable <created> timestamp were dropped",
          call. = FALSE)
  df <- df[!unparsed, , drop = FALSE]
}

# Cumulative number of notes created up to and including each note's time
# stamp. With ties.method = "max" this is the number of notes with time <= t.
df$note_count <- rank(as.numeric(df$time), ties.method = "max")

# Cumulative number of words written up to and including each note's time
# stamp: running total in chronological order, looked up at the position of
# the last note sharing that time stamp
chrono <- order(df$time)
running_words <- cumsum(df$count[chrono])
df$word_count <- running_words[df$note_count]

# To custom-scale the x-axis labels the time stamp needs to be a Date rather
# than POSIXct
df_2 <- df
df_2$time <- as.Date(df_2$time, tz = "GMT")

# Reset the working directory for saving images
setwd("~/Dropbox/Personal/Projects/Evernote_Analysis/")

# Date marked with a dashed line on the time plots
# NOTE(review): presumably the switch to an enterprise account - confirm
marker_date <- as.Date("2013-10-21")

# Create a graphic of the number of notes I have created by week.
# The x variable is POSIXct, so the one-week bin width is given in seconds.
# print() makes sure the plot is drawn even when the script is run through
# source(), where top-level values are not auto-printed.
pdf(file = "New_Notes_By_Week.pdf", width = 11, height = 8.5)
print(
  ggplot(df, aes(x = time)) +
    geom_histogram(binwidth = 60 * 60 * 24 * 7, fill = "blue") +
    xlab("Date") + ylab("Number Of New Notes") +
    ggtitle("Miles' New Evernotes By Week")
)
dev.off()

# To calculate the linear regression have to zero the x-axis time stamp
df_3 <- df_2
df_3$time <- as.double(df_3$time)
df_3$time <- df_3$time - min(df_3$time)

# Create a linear model (through the origin) for number of notes by time
fit <- lm(note_count ~ 0 + time, data = df_3)
print(summary(fit))
z <- coef(fit)

# Graph the total number of notes created over time.
# The model was fitted on days since the first note, so on the real date axis
# the fitted line has intercept -slope * (first date as a number).
pdf(file = "Total_Notes_Over_Time.pdf", width = 22, height = 17)
print(
  ggplot(df_2, aes(time, note_count)) + geom_point() +
    ylab("Total Number of Notes") + xlab("Date") +
    ggtitle("Total Number Of Notes Created Over Time") +
    scale_x_date(labels = date_format("%b-%Y"),
                 breaks = date_breaks("2 months")) +
    geom_abline(intercept = (-z * as.double(min(df_2$time))), slope = z,
                colour = "red") +
    geom_vline(xintercept = as.numeric(marker_date), colour = "blue",
               linetype = "longdash")
)
dev.off()

# Create a linear regression for words vs. time
fit <- lm(word_count ~ 0 + time, data = df_3)
print(summary(fit))
z <- coef(fit)

# Visualize the total number of words vs. time
pdf(file = "Total_Words_Over_Time.pdf", width = 22, height = 17)
print(
  ggplot(df_2, aes(time, word_count)) + geom_point() +
    ylab("Total Number of Words") + xlab("Date") +
    ggtitle("Total Number Of Words Written Over Time") +
    scale_x_date(labels = date_format("%b-%Y"),
                 breaks = date_breaks("2 months")) +
    geom_abline(intercept = (-z * as.double(min(df_2$time))), slope = z,
                colour = "red") +
    geom_vline(xintercept = as.numeric(marker_date), colour = "blue",
               linetype = "longdash")
)
dev.off()

# Create a linear regression for word count vs. note count
fit <- lm(word_count ~ 0 + note_count, data = df_2)
print(summary(fit))
z <- coef(fit)

# How many notes did I have when I became an enterprise client
# The 18th is the closest date I have for having created a note.
# Counting every note dated on or before the 18th gives a single value even
# when several notes (or none) were created on that exact day.
num_notes <- sum(df_2$time <= as.Date("2013-10-18"))

# Visualize the number of words vs. number of notes
pdf(file = "Total_Words_Over_Notebooks.pdf", width = 22, height = 17)
print(
  ggplot(df_2, aes(note_count, word_count)) + geom_point() +
    ylab("Total Number of Words") + xlab("Notes") +
    ggtitle("Total Number Of Words Written vs. Number Of Notes") +
    geom_abline(intercept = 0, slope = z, colour = "red") +
    geom_vline(xintercept = num_notes, colour = "blue", linetype = "longdash")
)
dev.off()
	# Load packages
	library(ggplot2)
	library(lubridate)
	library(stringr)
	library(scales)

	# Set the working directory
	# getwd() only reports the directory the script was started from; its value is
	# not stored. setwd() then moves into the folder whose contents are listed with
	# list.files() and read file by file in the script section further down.
	getwd()
	setwd("~/Dropbox/Personal/Projects/Evernote_Analysis/files/")

	######### FUNCTIONS ############

	# Receives the lines of one export file as a character vector (one element
	# per line, as produced by scan(..., what = "", sep = "\n")).
	# Returns a data frame with one row per note and two columns:
	#   times  - the raw text found between <created> and </created>
	#   counts - the number of whitespace-separated words left in the note once
	#            embedded <data> sections and markup tags have been removed
	# Stops with an error when the <note>, </note> and <created> lines do not pair
	# up one-to-one.
	get_dates_counts <- function(file) {

	  # Notes go from <note> to </note>. Plain substring matching is used: the
	  # previous patterns ('.<note>.' etc.) demanded a character on both sides of
	  # the tag and so missed lines consisting of the tag alone.
	  note_begins <- grep("<note>", file, fixed = TRUE)
	  note_ends <- grep("</note>", file, fixed = TRUE)
	  if (length(note_begins) != length(note_ends)) {
	    stop("Unbalanced <note> / </note> lines in export file", call. = FALSE)
	  }

	  # Lines recording when a note was created, and the timestamp inside each.
	  # The leading and trailing ".*" make gsub() replace the whole line, so only
	  # the captured timestamp remains.
	  created_pattern <- ".*<created>([0-9]+T[0-9]+Z)?</created>.*"
	  times <- gsub(created_pattern, "\\1",
	                grep(created_pattern, file, value = TRUE))
	  if (length(times) != length(note_begins)) {
	    stop("Found ", length(times), " <created> lines for ",
	         length(note_begins), " notes", call. = FALSE)
	  }

	  ## Get the word count for each note

	  # seq_along() visits every note that was found. The previous
	  # 1:length(lines) measured the graphics function `lines` (length 1), so only
	  # the first note was ever counted and its count was recycled for all rows.
	  counts <- vapply(seq_along(note_begins), function(z) {
	    note_lines <- file[note_begins[z]:note_ends[z]]

	    # Drop embedded data sections: every line from a "<data " opening line
	    # through the corresponding "</data>" line, both included. A logical mask
	    # avoids backwards ranges such as 1:0 when a section sits on the first or
	    # last line of the note.
	    data_start <- grep("<data ", note_lines, fixed = TRUE)
	    data_end <- grep("</data>", note_lines, fixed = TRUE)
	    keep <- rep(TRUE, length(note_lines))
	    for (y in seq_len(min(length(data_start), length(data_end)))) {
	      keep[data_start[y]:data_end[y]] <- FALSE
	    }
	    text_note_lines <- note_lines[keep]

	    # Have to get rid of markup tags so that only the text is counted
	    text_note_lines <- gsub("<(.*?)>", "", text_note_lines)

	    # Count the runs of non-whitespace characters on every line and add them
	    # up. Base regex is used here; NOTE(review): it may classify unusual
	    # Unicode whitespace differently from stringr - confirm if that matters.
	    words <- regmatches(text_note_lines, gregexpr("\\S+", text_note_lines))
	    sum(lengths(words))
	  }, integer(1))

	  # One row per note
	  data.frame(times, counts)
	}

	######### SCRIPT ############

	# Get all the files from the archive
	files <- list.files()

	# Get the total number of files; fail early with a clear message instead of
	# letting 1:0 iterate over a non-existent file
	num <- length(files)
	if (num == 0) {
	  stop("No export files found in ", getwd(), call. = FALSE)
	}

	# Read every file and stack the per-note timestamps and word counts into a
	# single data frame. The pieces are collected in a list and bound once, which
	# avoids re-copying the data frame on every iteration.
	per_file <- lapply(files, function(path) {
	  get_dates_counts(scan(path, what = "", sep = "\n"))
	})
	df <- do.call(rbind, per_file)

	# Rename the data frame columns
	names(df) <- c("time", "count")

	# Create a formatted time stamp. as.POSIXct() is vectorised, so the whole
	# column is parsed in one call; as.character() guards against the column
	# having been stored as a factor.
	df$time <- as.POSIXct(as.character(df$time),
	                      format = "%Y%m%dT%H%M%SZ", tz = "GMT")

	# A note whose timestamp cannot be parsed cannot be placed on the time axis
	# and would make the cumulative note counts below NA, so drop it with a warning
	unparsed <- is.na(df$time)
	if (any(unparsed)) {
	  warning(sum(unparsed),
	          " note(s) without a parseable <created> timestamp were dropped",
	          call. = FALSE)
	  df <- df[!unparsed, , drop = FALSE]
	}

	# Cumulative number of notes created up to and including each note's time
	# stamp. With ties.method = "max" this is the number of notes with time <= t.
	df$note_count <- rank(as.numeric(df$time), ties.method = "max")

	# Cumulative number of words written up to and including each note's time
	# stamp: running total in chronological order, looked up at the position of
	# the last note sharing that time stamp
	chrono <- order(df$time)
	running_words <- cumsum(df$count[chrono])
	df$word_count <- running_words[df$note_count]

	# To custom-scale the x-axis labels the time stamp needs to be a Date rather
	# than POSIXct
	df_2 <- df
	df_2$time <- as.Date(df_2$time, tz = "GMT")

	# Reset the working directory for saving images
	setwd("~/Dropbox/Personal/Projects/Evernote_Analysis/")

	# Date marked with a dashed line on the time plots
	# NOTE(review): presumably the switch to an enterprise account - confirm
	marker_date <- as.Date("2013-10-21")

	# Create a graphic of the number of notes I have created by week.
	# The x variable is POSIXct, so the bin width is given in seconds:
	# 60 * 60 * 24 * 7 is one week (the previous 606024*7 was about 49 days).
	# print() makes sure the plot is drawn even when the script is run through
	# source(), where top-level values are not auto-printed.
	pdf(file = "New_Notes_By_Week.pdf", width = 11, height = 8.5)
	print(
	  ggplot(df, aes(x = time)) +
	    geom_histogram(binwidth = 60 * 60 * 24 * 7, fill = "blue") +
	    xlab("Date") + ylab("Number Of New Notes") +
	    ggtitle("Miles' New Evernotes By Week")
	)
	dev.off()

	# To calculate the linear regression have to zero the x-axis time stamp
	df_3 <- df_2
	df_3$time <- as.double(df_3$time)
	df_3$time <- df_3$time - min(df_3$time)

	# Create a linear model (through the origin) for number of notes by time
	fit <- lm(note_count ~ 0 + time, data = df_3)
	print(summary(fit))
	z <- coef(fit)

	# Graph the total number of notes created over time.
	# The model was fitted on days since the first note, so on the real date axis
	# the fitted line has intercept -slope * (first date as a number).
	pdf(file = "Total_Notes_Over_Time.pdf", width = 22, height = 17)
	print(
	  ggplot(df_2, aes(time, note_count)) + geom_point() +
	    ylab("Total Number of Notes") + xlab("Date") +
	    ggtitle("Total Number Of Notes Created Over Time") +
	    scale_x_date(labels = date_format("%b-%Y"),
	                 breaks = date_breaks("2 months")) +
	    geom_abline(intercept = (-z * as.double(min(df_2$time))), slope = z,
	                colour = "red") +
	    geom_vline(xintercept = as.numeric(marker_date), colour = "blue",
	               linetype = "longdash")
	)
	dev.off()

	# Create a linear regression for words vs. time
	fit <- lm(word_count ~ 0 + time, data = df_3)
	print(summary(fit))
	z <- coef(fit)

	# Visualize the total number of words vs. time
	pdf(file = "Total_Words_Over_Time.pdf", width = 22, height = 17)
	print(
	  ggplot(df_2, aes(time, word_count)) + geom_point() +
	    ylab("Total Number of Words") + xlab("Date") +
	    ggtitle("Total Number Of Words Written Over Time") +
	    scale_x_date(labels = date_format("%b-%Y"),
	                 breaks = date_breaks("2 months")) +
	    geom_abline(intercept = (-z * as.double(min(df_2$time))), slope = z,
	                colour = "red") +
	    geom_vline(xintercept = as.numeric(marker_date), colour = "blue",
	               linetype = "longdash")
	)
	dev.off()

	# Create a linear regression for word count vs. note count
	fit <- lm(word_count ~ 0 + note_count, data = df_2)
	print(summary(fit))
	z <- coef(fit)

	# How many notes did I have when I became an enterprise client
	# The 18th is the closest date I have for having created a note.
	# Counting every note dated on or before the 18th gives a single value even
	# when several notes (or none) were created on that exact day.
	num_notes <- sum(df_2$time <= as.Date("2013-10-18"))

	# Visualize the number of words vs. number of notes
	pdf(file = "Total_Words_Over_Notebooks.pdf", width = 22, height = 17)
	print(
	  ggplot(df_2, aes(note_count, word_count)) + geom_point() +
	    ylab("Total Number of Words") + xlab("Notes") +
	    ggtitle("Total Number Of Words Written vs. Number Of Notes") +
	    geom_abline(intercept = 0, slope = z, colour = "red") +
	    geom_vline(xintercept = num_notes, colour = "blue", linetype = "longdash")
	)
	dev.off()