|
# related to my question here: https://github.com/tidyverse/googledrive/issues/218 |
|
|
|
# How to get access to specific revisions of a google drive document? Yes, we can do that. |
|
|
|
# How to get the username for each revision, and the size of the document at each revision (bytes or words, assuming we are specifically talking about docs and not sheets or other types of files) Yes, we can do that. |
|
|
|
# Goal is to measure the contributions of each author to a collaboratively-authored google doc with as little work as possible |
|
# How to get access to specific revisions of a google drive document? |
|
|
|
# Getting the revisions for a Google Doc |
|
|
|
# Limitations: |
|
# - When many users edit at the same time, we only get the name of the first editor active in that session. The others are not captured. |
|
# - When many edits happen in a short period of time, these are combined by Google and we cannot see them individually. We do not have good temporal resolution. |
|
# - Older edits are deleted by Google to save space. We do not know what the rules are for this. |
|
|
|
|
|
library(googledrive) |
|
library(tidyverse) |
|
|
|
# ID of the Google Doc to analyse -- replace this with the ID of your own doc
# (the ID is the long token between /d/ and /edit in the document's URL)

fileId <- "1s0CPFXnMQjZNts6gYAnkcGXGSAgugTupzMf8YeoCbps"
|
|
|
# Get the name of the file and some other metadata.
# build_request()/make_request()/process_response() are low-level helpers
# from googledrive that wrap the Drive REST API; drive_token() supplies the
# OAuth token for the currently authenticated user.
# NOTE: the request object is named file_req (not `file`) so it does not
# shadow base::file().
file_req <- build_request(
  path = "drive/v3/files/{fileId}",
  method = "GET",
  params = list(
    fileId = fileId,
    fields = "*"  # ask for every available metadata field
  ),
  token = drive_token()
)

# file_ret$name (the document title) is used later to find the downloaded
# revision files on disk.
file_ret <- process_response(make_request(file_req))
|
|
|
# Now, for this doc, query the Drive API to get the URLs and other metadata for all of the revisions available to us
|
|
|
# Build a v2 API request for the revision list of this file.
# (v2 is used here because its revision resources include the
# per-format exportLinks we need below.)
req2 <- build_request(
  method = "GET",
  path = "drive/v2/files/{fileId}/revisions",
  params = list(fileId = fileId),
  token = drive_token()
)

# Execute the request and parse the JSON response into a list;
# the individual revisions live in revs2$items.
revs2 <- process_response(make_request(req2))
|
|
|
# See |
|
# https://developers.google.com/drive/api/v2/reference/revisions#resource |
|
# for an explanation of each variable that we have here |
|
|
|
# tidy revisions into a dataframe |
|
# The scalar metadata fields we want from each revision resource
rev_fields <- c(
  "kind", "etag", "id", "selfLink", "mimeType",
  "modifiedDate", "published", "lastModifyingUserName"
)

# One row per revision, holding the scalar metadata fields
revs2_df <- map_df(revs2$items, ~ .x[rev_fields])

# The per-format download URLs live in a nested "exportLinks" list
# on each revision; pull them out into their own data frame
revs2_export_url <- map_df(revs2$items, "exportLinks")

# Combine metadata and export URLs, row for row
revs2_df_bind <- bind_cols(revs2_df, revs2_export_url)
|
|
|
# Now we have a dataframe of edits that includes the date of the edit, the person who did the edit, and URLs to download the document as it was at that edit. |
|
|
|
# Here are the download URLs, eg for plain text: |
|
|
|
# Export URLs for the plain-text rendering of each revision, one per
# revision, in the order returned by the API.
paths <- revs2_df_bind$`text/plain`

# Revision IDs kept alongside, in case we need to join back to the API
# results -- NOTE(review): not used further below.
rev_ids <- revs2_df_bind$id
|
|
|
# I have found only one way to get the contents of these versions, and that is to download them using browseURL. This is not ideal because we can't control the download location or the destination filename, and our session is blocked while the browser is busy. Furthermore, we get blocked if we download too fast, so we have to sleep for 1 sec in between. Even more annoying is that Chrome stops autonumbering downloads with the same name at 100. That's a real pain if there are >100 revisions! Safari can do >100, but uses a different file-naming system. There must be a better way!
|
# Note that we need to be logged in to Google Drive in the browser before we attempt to download these files.
|
|
|
# Download the plain-text export of each revision by opening its URL in the
# default browser (the browser saves it to the default download directory).
# walk() rather than map() because this is run purely for its side effect;
# the 1-second sleep avoids being rate-limited by Google.
walk(paths, function(x) {
  browseURL(x)
  Sys.sleep(1)
})
|
|
|
# read in the files that we got from the versions |
|
|
|
# Find the downloaded revision files in the default download directory.
# list.files() treats `pattern` as a regular expression, so escape any
# regex metacharacters in the document title (parentheses, dots, brackets,
# etc.) so the title is matched literally.
doc_name_pattern <- gsub("([][{}()+*^$|\\\\?.])", "\\\\\\1", file_ret$name)
rev_file_names <- list.files("~/Downloads", # not ideal...
                             pattern = doc_name_pattern,
                             full.names = TRUE)
|
|
|
# get files in a useful order... |
|
# get the rev idx, this depends on the browser, Chrome does (n).txt, |
|
# Safari does -n.txt. Also we need to deal with the |
|
# first download which has no number |
|
|
|
|
|
#------ browser specific actions ---------
|
# if using chrome... |
|
# Extract the download index from each file name.
# Chrome numbers repeated downloads as "name (1).txt", "name (2).txt", ...;
# the first (unnumbered) download gets index 0.
rev_file_names_indx <-
  map_dbl(rev_file_names, function(f) {
    # only digit-filled groups, so a title containing "(draft)" etc.
    # cannot confuse the extraction
    hits <- str_extract_all(f, "\\((\\d+)\\)")[[1]]
    if (length(hits) == 0) {
      0
    } else {
      # take the last match: Chrome appends its " (n)" at the end
      as.numeric(gsub("\\((\\d+)\\)", "\\1", hits[length(hits)]))
    }
  })
|
|
|
# if using safari |
|
# Extract the download index from each file name.
# Safari numbers repeated downloads as "name-1.txt", "name-2.txt", ...;
# the first (unnumbered) download gets index 1.
# (The original checked identical(.x, "character(0)") -- a string comparison
# that only worked because of an accidental list-to-character coercion;
# replaced with an explicit length check.)
rev_file_names_indx <-
  map_dbl(rev_file_names, function(f) {
    # require at least one digit after the dash so a bare "-" never matches
    hits <- str_extract_all(f, "-(\\d+)")[[1]]
    if (length(hits) == 0) {
      1
    } else {
      # take the last match: Safari appends its "-n" at the end
      as.numeric(str_remove(hits[length(hits)], "-"))
    }
  })
|
#------ end of browser specific actions ---------
|
|
|
# into a data frame |
|
# Pair each downloaded file with its download index and sort by index, so
# rows are in revision order. tibble() replaces the deprecated data_frame().
rev_file_names_df <-
  tibble(file_name = rev_file_names,     # path to each downloaded revision
         idx = rev_file_names_indx) %>%  # browser-assigned download number
  arrange(idx)
|
|
|
# name of user for each revision is in, only get as many names as |
|
# we could download files, in case that was rate limited: |
|
|
|
# Keep only as many user names / dates as files we actually downloaded
# (downloads may have been rate-limited or cut short).
# head() is used instead of [1:n] so that n == 0 yields an empty vector
# rather than the bogus c(1, 0) indices produced by 1:0.
n_downloaded <- nrow(rev_file_names_df)
user_names <- head(revs2_df_bind$lastModifyingUserName, n_downloaded)
modifiedDate <- head(revs2_df_bind$modifiedDate, n_downloaded)
|
|
|
# how to quantify difference between revisions? Word count seems
# reasonable for simple text documents
|
|
|
# Build the per-revision word-count table: one row per revision with its
# word count, the change vs. the previous revision, the author, and the
# parsed timestamp.
# NOTE(review): nest(-idx) is superseded syntax (tidyr >= 1.0 prefers
# nest(data = -idx)); kept as-is to preserve behavior.
rev_file_names_df_word_counts <-
  rev_file_names_df %>%
  # one list-column row per download index
  nest(-idx) %>%
  # read each revision's text and count word tokens (runs of \w characters)
  mutate(word_count = map_int(data,
                              ~readr::read_file(.x$file_name) %>%
                                str_count(., '\\w+'))) %>%
  select(-data) %>%
  # words added (negative = removed) relative to the previous revision;
  # the first row has no predecessor, so its diff is NA
  mutate(diff_count = word_count - lag(word_count)) %>%
  # attach author and timestamp column-wise; this assumes the rows are in
  # the same order as the API's revision list -- TODO confirm
  mutate(lastModifyingUserName = user_names,
         modifiedDate = modifiedDate) %>%
  # modifiedDate is RFC 3339, e.g. "2018-05-01T03:15:00.000Z":
  # split on the "T" into a date part and a time part
  separate(modifiedDate,
           into= c("ymd", "time"),
           sep = "T") %>%
  # date-only POSIXct, used as the x-axis in the plots below
  mutate(ymd = as.POSIXct(ymd)) %>%
  # full timestamp; trailing fractional seconds / "Z" are ignored by this
  # format string -- NOTE(review): parsed in the local time zone, not UTC
  mutate(date_time = as.POSIXct(paste(ymd,
                                      time),
                                format="%Y-%m-%d %H:%M:%S"))
|
|
|
# Clean up: delete the downloaded revision files from the download
# directory, now that their contents have been read into the table above.
unlink(rev_file_names)
|
|
|
# now we've got a table of the revisions, with a user |
|
# name for each revision, and a word count diff |
|
#------------------------------------------------------------------- |
|
|
|
# EDA |
|
|
|
# Let's explore it a little with some plots |
|
|
|
# total contribution by each author |
|
# Total words contributed by each author, summed over all their revisions
total_by_author <-
  rev_file_names_df_word_counts %>%
  group_by(lastModifyingUserName) %>%
  summarise(total_words_added = sum(diff_count, na.rm = TRUE))

# Bar chart of contributions, authors ordered from largest to smallest
ggplot(total_by_author,
       aes(x = reorder(lastModifyingUserName, -total_words_added),
           y = total_words_added)) +
  geom_col() +
  xlab("Author") +
  ylab("Total number of words contributed to the document") +
  theme_minimal()
|
|
|
# change in document size over time |
|
library(lubridate) |
|
# Stacked bars of the word-count change per revision over time,
# coloured by the author of each revision
ggplot(rev_file_names_df_word_counts,
       aes(x = ymd,
           y = diff_count,
           fill = lastModifyingUserName)) +
  geom_col() +
  scale_x_datetime(date_labels = "%b %Y") +
  xlab("Date") +
  ylab("Change in the number of words") +
  theme_minimal()
|
|
|
|
|
|
|
# Sniffing the web traffic |
|
# when using the gdocs 'view revision history' shows that 'end' and 'ar' |
|
# are GET variables that refer to the revision ID. 'end' is the revision
|
# ID of the rev that we want to look at when we click a rev on the list |
|
# on the RHS. Can we make a GET request that uses this 'end' variable with |
|
# googledrive? |
|
|
|
|
|
|
# For my private doc 1s0CPFXnMQjZNts6gYAnkcGXGSAgugTupzMf8YeoCbps, I get: