kdaily/semi-annual-usage-report.Rmd

## semi-annual-usage-report.Rmd
---
title: "AMP-AD Semi-Annual Report"
author: "Kenneth Daily"
date: "`r format(Sys.time(), '%B %d, %Y')`"
output: html_document
---

```{r setup, include=FALSE}
# Chunk defaults for the report: hide code, messages, and warnings.
knitr::opts_chunk$set(
  echo = FALSE,
  message = FALSE,
  warning = FALSE
)

library(tidyverse)
library(lubridate)
library(synapser)
library(synapseusagereports)

# Columns retained from the raw usage CSV files when they are read in.
keep_cols <- c(
  "id", "DATE", "TIMESTAMP", "NODE_TYPE", "NAME",
  "recordType", "date", "userId", "dateGrouping", "monthYear"
)

# Log in to Synapse; no credentials are passed explicitly here.
synLogin()
```

```{r}
# The input CSV files must already be in this directory (download them from
# S3 or another location before knitting).
data_dir <- "/tmp/usagestats"

# Inputs for the 20190601-20191201 period.
download_file <- glue::glue(
  "{data_dir}/another-syn2580853-20190601-20191201-download.csv"
)
filedownloadrecord_file <- glue::glue(
  "{data_dir}/another-syn2580853-20190601-20191201-filedownloadrecord.csv"
)

# Alternative inputs (20181201-20190601 files), kept for reference:
# download_file <- glue::glue("{data_dir}/syn2580853_20181201-20190601-download.csv")
# filedownloadrecord_file <- glue::glue("{data_dir}/syn2580853_20181201-20190601-filedownloadrecord.csv")
```

```{r readdata}
# Read the two usage exports, keeping only the columns of interest.
# any_of() silently skips entries of keep_cols that a file does not contain
# (one_of() is superseded and warns instead).
download_data <- read_csv(download_file) %>%
  select(any_of(keep_cols))

# Every row of the file download record export is labelled as a download.
fdr_data <- read_csv(filedownloadrecord_file) %>%
  select(any_of(keep_cols)) %>%
  mutate(recordType = "download")

# Combine download and fdr data. bind_rows() matches columns by name and
# fills a column missing from one input with NA, whereas rbind() would stop
# with an error if the two files kept different subsets of keep_cols.
query_data <- bind_rows(download_data, fdr_data) %>%
  distinct() %>%
  mutate(
    # Prefix the id with "syn".
    id = paste0("syn", id),
    userId = as.character(userId),
    quarter = lubridate::quarter(DATE),
    year = lubridate::year(DATE),
    # Label such as "2019 Q3", built from the two columns above.
    quarteryear = glue::glue("{year} Q{quarter}")
  )


# Teams to look up, passed to processTeamMemberList().
team_order <- c(3320424, 273957)
user_list <- processTeamMemberList(team_order)

all_users <- getQueryUserProfiles(
  query_data,
  useTeamGrouping = TRUE,
  userList = user_list
)

# No `by` is given, so the join uses every column the two tables share.
query_data <- left_join(query_data, all_users)

# Every record is kept; the per-(id, userId) de-duplication is disabled.
query_data_single <- query_data
# query_data_single <- query_data %>%
#   group_by(id, userId) %>%
#   slice(1) %>%
#   ungroup()

# All data files in this file view are the ones we're interested in
file_view_id <- "syn11346063"
file_view_res <- synTableQuery(
  glue::glue("SELECT id,study from {file_view_id}")
)

file_view_df <- select(
  tibble::as_tibble(file_view_res$asDataFrame()),
  id, study
)

# Attach the study annotation to each record. A left join keeps every
# record; ids absent from the file view end up with study = NA.
query_data_annotated <- query_data_single %>%
  dplyr::left_join(file_view_df, by = "id")

# Test here for number of entities without a study annotated
# query_data_annotated %>% filter(is.na(study)) %>% summarize(n=n_distinct(id))

# Only the records whose file has a study annotation.
query_data_annotated_filtered <- filter(query_data_annotated, !is.na(study))

```

```{r}
# One-row table with a column per teamName holding its record count.
res <- pivot_wider(
  count(query_data_annotated, teamName),
  names_from = "teamName",
  values_from = "n"
)
```

```{r}
# Share of records from the AMP-AD Consortium among the two teams.
# A team with no records has no column in `res`: `[[` then yields NULL and
# sum(NULL) is 0, so the team counts as zero. With `$` the missing column
# would collapse the ratio to numeric(0) and blank out the inline percentage.
# If both teams are absent the result is NaN.
ampad_n <- sum(res[["AMP-AD Consortium"]])
registered_n <- sum(res[["Registered Synapse User"]])
prop_ampad <- ampad_n / (ampad_n + registered_n)
```
## Summary

```{r}
# Overall totals across every record, with or without a study annotation.
query_data_annotated %>%
  summarize(
    nrecords = n(),
    nusers = n_distinct(userId),
    nfiles = n_distinct(id),
    min_date = min(DATE),
    max_date = max(DATE)
  )
```

```{r}
# The same totals, restricted to records that have a study annotation.
query_data_annotated_filtered %>%
  summarize(
    nrecords = n(),
    nusers = n_distinct(userId),
    nfiles = n_distinct(id),
    min_date = min(DATE),
    max_date = max(DATE)
  )
```

Between `r min(query_data_annotated$dateGrouping)` and `r max(query_data_annotated$dateGrouping)` there are:

- `r nrow(query_data_annotated)` downloads.
- `r round(prop_ampad, digits = 2) * 100`% are from the AMP-AD Consortium.

## Downloads by study

```{r}
# Per-study usage table: distinct users, record count, distinct files, and
# downloads per file, rendered as a DT table.
bystudy <- query_data_annotated %>%
  group_by(study) %>%
  summarize(
    Users = n_distinct(userId),
    Downloads = n(),
    Files = n_distinct(id)
  ) %>%
  ungroup() %>%
  mutate(`Downloads Per File` = Downloads / Files) %>%
  select(Study = study, Files, Users, Downloads, `Downloads Per File`) %>%
  DT::datatable(
    # Show up to 75 rows per page and hide the page-length control.
    options = list(pageLength = 75, lengthChange = FALSE)
  )

bystudy
```

```{r}
# Record and distinct-user counts for each dateGrouping value.
query_data_annotated %>%
  group_by(dateGrouping) %>%
  summarize(
    Downloads = n(),
    Users = n_distinct(userId)
  )
```
	---
	title: "AMP-AD Semi-Annual Report"
	author: "Kenneth Daily"
	date: "`r format(Sys.time(), '%B %d, %Y')`"
	output: html_document
	---

	```{r setup, include=FALSE}
	# Chunk defaults for the report: hide code, messages, and warnings.
	knitr::opts_chunk$set(
	  echo = FALSE,
	  message = FALSE,
	  warning = FALSE
	)

	library(tidyverse)
	library(lubridate)
	library(synapser)
	library(synapseusagereports)

	# Columns retained from the raw usage CSV files when they are read in.
	keep_cols <- c(
	  "id", "DATE", "TIMESTAMP", "NODE_TYPE", "NAME",
	  "recordType", "date", "userId", "dateGrouping", "monthYear"
	)

	# Log in to Synapse; no credentials are passed explicitly here.
	synLogin()
	```

	```{r}
	# The input CSV files must already be in this directory (download them from
	# S3 or another location before knitting).
	data_dir <- "/tmp/usagestats"

	# Inputs for the 20190601-20191201 period.
	download_file <- glue::glue(
	  "{data_dir}/another-syn2580853-20190601-20191201-download.csv"
	)
	filedownloadrecord_file <- glue::glue(
	  "{data_dir}/another-syn2580853-20190601-20191201-filedownloadrecord.csv"
	)

	# Alternative inputs (20181201-20190601 files), kept for reference:
	# download_file <- glue::glue("{data_dir}/syn2580853_20181201-20190601-download.csv")
	# filedownloadrecord_file <- glue::glue("{data_dir}/syn2580853_20181201-20190601-filedownloadrecord.csv")
	```

	```{r readdata}
	# Read the two usage exports, keeping only the columns of interest.
	# any_of() silently skips entries of keep_cols that a file does not contain
	# (one_of() is superseded and warns instead).
	download_data <- read_csv(download_file) %>%
	  select(any_of(keep_cols))

	# Every row of the file download record export is labelled as a download.
	fdr_data <- read_csv(filedownloadrecord_file) %>%
	  select(any_of(keep_cols)) %>%
	  mutate(recordType = "download")

	# Combine download and fdr data. bind_rows() matches columns by name and
	# fills a column missing from one input with NA, whereas rbind() would stop
	# with an error if the two files kept different subsets of keep_cols.
	query_data <- bind_rows(download_data, fdr_data) %>%
	  distinct() %>%
	  mutate(
	    # Prefix the id with "syn".
	    id = paste0("syn", id),
	    userId = as.character(userId),
	    quarter = lubridate::quarter(DATE),
	    year = lubridate::year(DATE),
	    # Label such as "2019 Q3", built from the two columns above.
	    quarteryear = glue::glue("{year} Q{quarter}")
	  )


	# Teams to look up, passed to processTeamMemberList().
	team_order <- c(3320424, 273957)
	user_list <- processTeamMemberList(team_order)

	all_users <- getQueryUserProfiles(
	  query_data,
	  useTeamGrouping = TRUE,
	  userList = user_list
	)

	# No `by` is given, so the join uses every column the two tables share.
	query_data <- left_join(query_data, all_users)

	# Every record is kept; the per-(id, userId) de-duplication is disabled.
	query_data_single <- query_data
	# query_data_single <- query_data %>%
	#   group_by(id, userId) %>%
	#   slice(1) %>%
	#   ungroup()

	# All data files in this file view are the ones we're interested in
	file_view_id <- "syn11346063"
	file_view_res <- synTableQuery(
	  glue::glue("SELECT id,study from {file_view_id}")
	)

	file_view_df <- select(
	  tibble::as_tibble(file_view_res$asDataFrame()),
	  id, study
	)

	# Attach the study annotation to each record. A left join keeps every
	# record; ids absent from the file view end up with study = NA.
	query_data_annotated <- query_data_single %>%
	  dplyr::left_join(file_view_df, by = "id")

	# Test here for number of entities without a study annotated
	# query_data_annotated %>% filter(is.na(study)) %>% summarize(n=n_distinct(id))

	# Only the records whose file has a study annotation.
	query_data_annotated_filtered <- filter(query_data_annotated, !is.na(study))

	```

	```{r}
	# One-row table with a column per teamName holding its record count.
	res <- pivot_wider(
	  count(query_data_annotated, teamName),
	  names_from = "teamName",
	  values_from = "n"
	)
	```

	```{r}
	# Share of records from the AMP-AD Consortium among the two teams.
	# A team with no records has no column in `res`: `[[` then yields NULL and
	# sum(NULL) is 0, so the team counts as zero. With `$` the missing column
	# would collapse the ratio to numeric(0) and blank out the inline percentage.
	# If both teams are absent the result is NaN.
	ampad_n <- sum(res[["AMP-AD Consortium"]])
	registered_n <- sum(res[["Registered Synapse User"]])
	prop_ampad <- ampad_n / (ampad_n + registered_n)
	```
	## Summary

	```{r}
	# Overall totals across every record, with or without a study annotation.
	query_data_annotated %>%
	  summarize(
	    nrecords = n(),
	    nusers = n_distinct(userId),
	    nfiles = n_distinct(id),
	    min_date = min(DATE),
	    max_date = max(DATE)
	  )
	```

	```{r}
	# The same totals, restricted to records that have a study annotation.
	query_data_annotated_filtered %>%
	  summarize(
	    nrecords = n(),
	    nusers = n_distinct(userId),
	    nfiles = n_distinct(id),
	    min_date = min(DATE),
	    max_date = max(DATE)
	  )
	```

	Between `r min(query_data_annotated$dateGrouping)` and `r max(query_data_annotated$dateGrouping)` there are:

	- `r nrow(query_data_annotated)` downloads.
	- `r round(prop_ampad, digits = 2) * 100`% are from the AMP-AD Consortium.

	## Downloads by study

	```{r}
	# Per-study usage table: distinct users, record count, distinct files, and
	# downloads per file, rendered as a DT table.
	bystudy <- query_data_annotated %>%
	  group_by(study) %>%
	  summarize(
	    Users = n_distinct(userId),
	    Downloads = n(),
	    Files = n_distinct(id)
	  ) %>%
	  ungroup() %>%
	  mutate(`Downloads Per File` = Downloads / Files) %>%
	  select(Study = study, Files, Users, Downloads, `Downloads Per File`) %>%
	  DT::datatable(
	    # Show up to 75 rows per page and hide the page-length control.
	    options = list(pageLength = 75, lengthChange = FALSE)
	  )

	bystudy
	```

	```{r}
	# Record and distinct-user counts for each dateGrouping value.
	query_data_annotated %>%
	  group_by(dateGrouping) %>%
	  summarize(
	    Downloads = n(),
	    Users = n_distinct(userId)
	  )
	```