# Load packages | |
library(ggplot2) | |
library(lubridate) | |
library(stringr) | |
library(scales) | |
# Set the working directory
# Show the directory the script was started from. The value is print()ed
# explicitly because a bare getwd() call displays nothing when the file is
# run through source().
print(getwd())
# All export files are read from this folder by the list.files() call below
setwd("~/Dropbox/Personal/Projects/Evernote_Analysis/files/")
######### FUNCTIONS ############ | |
# Helper: removes embedded data sections from the lines of one note.
# Every line from a line containing "<data " through the next line containing
# "</data>" (both inclusive) is dropped; all other lines are returned in order.
.strip_data_sections <- function(note_lines) {
  data_start <- grep("<data ", note_lines, fixed = TRUE)
  if (length(data_start) == 0) {
    return(note_lines)
  }
  data_end <- grep("</data>", note_lines, fixed = TRUE)
  keep <- rep(TRUE, length(note_lines))
  for (start in data_start) {
    closing <- data_end[data_end >= start]
    # A section without a closing tag swallows the rest of the note
    stop_at <- if (length(closing) > 0) closing[1] else length(note_lines)
    keep[start:stop_at] <- FALSE
  }
  note_lines[keep]
}

# Helper: total number of whitespace-separated words in a character vector
# once everything between "<" and ">" has been removed from each element.
.count_words <- function(text_lines) {
  plain <- gsub("<(.*?)>", "", text_lines)
  hits <- gregexpr("\\S+", plain)
  # gregexpr() reports "no match" as -1, so only positive positions are words
  sum(vapply(hits, function(m) sum(m > 0L), integer(1)))
}

# Extracts the creation time stamp and the word count of every note found in
# one export file.
#
# Args:
#   file: character vector with the lines of the export, one element per line.
#
# Returns:
#   A data frame with one row per <note> ... </note> span and two columns:
#     times  - the text captured from the first <created>...</created> tag
#              inside the note (NA when the note has no such tag)
#     counts - number of words left in the note after data sections and tags
#              are removed; any remaining text counts, including the values
#              of other tags such as the created stamp itself
#   An input without notes yields a data frame with zero rows.
#
# Stops with an error when the number of <note> and </note> lines differ,
# because the note spans cannot be determined in that case.
get_dates_counts <- function(file) {
  created_pattern <- ".*<created>([0-9]+T[0-9]+Z)?</created>.*"

  # Notes go from <note> to </note>
  note_begins <- grep("<note>", file, fixed = TRUE)
  note_ends <- grep("</note>", file, fixed = TRUE)
  if (length(note_begins) != length(note_ends)) {
    stop("Found ", length(note_begins), " <note> lines but ",
         length(note_ends), " </note> lines", call. = FALSE)
  }

  n_notes <- length(note_begins)
  times <- character(n_notes)
  counts <- integer(n_notes)

  # Walk over the notes (not over a fixed number of iterations), so every
  # note gets its own time stamp and its own word count
  for (z in seq_len(n_notes)) {
    note_lines <- file[note_begins[z]:note_ends[z]]

    # The stamp is looked up before data sections are removed so that a
    # created tag sharing a line with a data tag is still found
    stamp_lines <- grep(created_pattern, note_lines, value = TRUE)
    times[z] <- if (length(stamp_lines) > 0) {
      sub(created_pattern, "\\1", stamp_lines[1])
    } else {
      NA_character_
    }

    counts[z] <- .count_words(.strip_data_sections(note_lines))
  }

  data.frame(times, counts, stringsAsFactors = FALSE)
}
######### SCRIPT ############ | |
# Get all the files from the archive
files <- list.files()
# Get the total number of files
num <- length(files)
# Parse every file into its own data frame of time stamps and word counts.
# lapply() copes with an empty directory (1:num would iterate over c(1, 0))
# and avoids growing one data frame with rbind() on every iteration.
per_file <- lapply(files, function(path) {
  get_dates_counts(scan(path, what = "", sep = "\n"))
})
# Files that contain no notes contribute no rows
per_file <- Filter(function(d) nrow(d) > 0, per_file)
# Compile a single data frame with all the dates and word counts
if (length(per_file) > 0) {
  df <- do.call(rbind, per_file)
} else {
  # Nothing was read: fall back to an empty two-column data frame
  df <- data.frame(time = vector(), count = vector())
}
# Rename the data frame columns
names(df) <- c("time", "count")
# Create a formatted time stamp
# The raw stamps look like "20131018T153000Z" and are parsed as GMT in one
# vectorized call; stamps that do not match the format become NA
df$time <- as.POSIXct(as.character(df$time),
                      format = "%Y%m%dT%H%M%SZ", tz = "GMT")
# For every note, count how many notes were created at or before its own
# creation time. findInterval() against the sorted times gives exactly that
# count (ties included) without comparing every note with every other note.
time_num <- as.numeric(df$time)
created_so_far <- findInterval(time_num, sort(time_num))
# Create an additional column to store the cumulative number of notes created over time
# (NA for a note whose time stamp could not be parsed)
df$note_count <- created_so_far
# Create an additional column to store the cumulative sum of words written over time
# Word counts are accumulated in time order and then looked up at the position
# of the last note created at or before each note (NA for unparsed stamps)
words_in_time_order <- cumsum(df$count[order(time_num)])
df$word_count <- words_in_time_order[created_so_far]
# To custom scale the X-axis the time stamp needs to be in Date rather than POSIXct format
df_2 <- df
df_2$time <- as.Date(df_2$time, tz = "UTC")
# Reset the working directory for saving images
setwd("~/Dropbox/Personal/Projects/Evernote_Analysis/")
# Create a graphic of the number of notes I have created by week
pdf(file = "New_Notes_By_Week.pdf", width = 11, height = 8.5)
# df$time is POSIXct, so the bin width is given in seconds:
# 60 * 60 * 24 * 7 is one week.
# The plot is print()ed explicitly: a bare ggplot expression is not drawn when
# the script is run through source(), which would leave the PDF empty.
print(
  ggplot(df, aes(x = time)) +
    geom_histogram(binwidth = 60 * 60 * 24 * 7, fill = "blue") +
    xlab("Date") +
    ylab("Number Of New Notes") +
    ggtitle("Miles' New Evernotes By Week")
)
dev.off()
# To calculate the linear regression have to zero the x-axis time stamp
# (time becomes the number of days since the earliest note)
df_3 <- df_2
df_3$time <- as.double(df_3$time)
df_3$time <- df_3$time - min(df_3$time, na.rm = TRUE)
# Create a linear model for number of notes by time
# "0 +" forces the fitted line through the origin, i.e. zero notes on day zero
fit <- lm(note_count ~ 0 + time, data = df_3)
# print() so the summary is also shown when the script is run via source()
print(summary(fit))
z <- coef(fit)
# Graph the total number of notes created over time
pdf(file = "Total_Notes_Over_Time.pdf", width = 22, height = 17)
# The model was fitted on days since the first note while the plot uses
# calendar dates, so the line is shifted to cross zero at the first date:
# intercept = -slope * (first date as a number).
# The plot is print()ed explicitly so the PDF is not left empty under source().
print(
  ggplot(df_2, aes(time, note_count)) +
    geom_point() +
    ylab("Total Number of Notes") +
    xlab("Date") +
    ggtitle("Total Number Of Notes Created Over Time") +
    scale_x_date(labels = date_format("%b-%Y"),
                 breaks = date_breaks("2 months")) +
    geom_abline(intercept = (-z * as.double(min(df_2$time, na.rm = TRUE))),
                slope = z, colour = "red") +
    # Vertical marker at 2013-10-21
    geom_vline(xintercept = as.numeric(as.Date("2013-10-21")),
               colour = "blue", linetype = "longdash")
)
dev.off()
# Create a linear regression for words vs. time
# df_3 holds time as days since the earliest note; "0 +" forces the fitted
# line through the origin
fit <- lm(word_count ~ 0 + time, data = df_3)
# print() so the summary is also shown when the script is run via source()
print(summary(fit))
z <- coef(fit)
# Visualize the total number of words vs. time
pdf(file = "Total_Words_Over_Time.pdf", width = 22, height = 17)
# The fitted slope is per day since the first note, while the plot uses
# calendar dates, hence intercept = -slope * (first date as a number).
# The plot is print()ed explicitly so the PDF is not left empty under source().
print(
  ggplot(df_2, aes(time, word_count)) +
    geom_point() +
    ylab("Total Number of Words") +
    xlab("Date") +
    ggtitle("Total Number Of Words Written Over Time") +
    scale_x_date(labels = date_format("%b-%Y"),
                 breaks = date_breaks("2 months")) +
    geom_abline(intercept = (-z * as.double(min(df_2$time, na.rm = TRUE))),
                slope = z, colour = "red") +
    # Vertical marker at 2013-10-21
    geom_vline(xintercept = as.numeric(as.Date("2013-10-21")),
               colour = "blue", linetype = "longdash")
)
dev.off()
# Create a linear regression for word count vs. note count
# "0 +" forces the fitted line through the origin (no notes, no words)
fit <- lm(word_count ~ 0 + note_count, data = df_2)
# print() so the summary is also shown when the script is run via source()
print(summary(fit))
z <- coef(fit)
# How many notebooks did I have when I became an enterprise client
# The 18th is the closest date I have for having created a notebook
# Counting every note created on or before that date always gives a single
# number, even when several notes (or none) were created on the 18th itself;
# it equals the largest note_count among the notes of that day.
num_notes <- sum(df_2$time <= as.Date("2013-10-18"), na.rm = TRUE)
# Visualize the number of words vs. number of notes by time
pdf(file = "Total_Words_Over_Notebooks.pdf", width = 22, height = 17)
# The plot is print()ed explicitly so the PDF is not left empty under source().
print(
  ggplot(df_2, aes(note_count, word_count)) +
    geom_point() +
    ylab("Total Number of Words") +
    xlab("Notes") +
    ggtitle("Total Number Of Words Written vs. Number Of Notes") +
    # The model has no intercept term, so the line starts at the origin
    geom_abline(intercept = 0, slope = z, colour = "red") +
    geom_vline(xintercept = num_notes, colour = "blue", linetype = "longdash")
)
dev.off()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment