@article{yenni_developing_2019,
title = {Developing a modern data workflow for regularly updated data},
volume = {17},
issn = {1545-7885},
url = {https://journals.plos.org/plosbiology/article?id=10.1371/journal.pbio.3000125},
doi = {10.1371/journal.pbio.3000125},
abstract = {Over the past decade, biology has undergone a data revolution in how researchers collect data and the amount of data being collected. An emerging challenge that has received limited attention in biology is managing, working with, and providing access to data under continual active collection. Regularly updated data present unique challenges in quality assurance and control, data publication, archiving, and reproducibility. We developed a workflow for a long-term ecological study that addresses many of the challenges associated with managing this type of data. We do this by leveraging existing tools to 1) perform quality assurance and control; 2) import, restructure, version, and archive data; 3) rapidly publish new data in ways that ensure appropriate credit to all contributors; and 4) automate most steps in the data pipeline to reduce the time and effort required by researchers. The workflow leverages tools from software development, including version control and continuous integration, to create a modern data management system that automates the pipeline.},
language = {en},
number = {1},
urldate = {2019-10-31},
journal = {PLOS Biology},
author = {Yenni, Glenda M. and Christensen, Erica M. and Bledsoe, Ellen K. and Supp, Sarah R. and Diaz, Renata M. and White, Ethan P. and Ernest, S. K. Morgan},
month = jan,
year = {2019},
keywords = {Databases, Archives, Biological data management, Data management, Programming languages, Quality control, Reproducibility, Scientists},
pages = {e3000125},
file = {Full Text PDF:/home/wmay/Zotero/storage/STIA7KEW/Yenni et al. - 2019 - Developing a modern data workflow for regularly up.pdf:application/pdf;Snapshot:/home/wmay/Zotero/storage/HFH26RZ9/article.html:text/html}
}
@book{chacon_pro_2014,
address = {Berkeley, CA, USA},
edition = {2nd},
title = {Pro {Git}},
isbn = {978-1-4842-0077-3},
abstract = {Pro Git (Second Edition) is your fully-updated guide to Git and its usage in the modern world. Git has come a long way since it was first developed by Linus Torvalds for Linux kernel development. It has taken the open source world by storm since its inception in 2005, and this book teaches you how to use it like a pro. Effective and well-implemented version control is a necessity for successful web projects, whether large or small. With this book you'll learn how to master the world of distributed version workflow, use the distributed features of Git to the full, and extend Git to meet your every need. Written by Git pros Scott Chacon and Ben Straub, Pro Git (Second Edition) builds on the hugely successful first edition, and is now fully updated for Git version 2.0, as well as including an indispensable chapter on GitHub. It's the best book for all your Git needs. What you'll learn Effectively use Git, either as a programmer or a project leader Become a fluent Git user Master branching, using Git on the server, and on other systems Integrate Git in your development workflow Migrate programming projects from other SCMs to Git Extend Git for your personal project needs Effectively use GitHub Who this book is for This book is for all open source developers: you are bound to encounter Git somewhere in the course of your working life. Proprietary software developers will appreciate Git's enormous scalability, since it is used for the Linux project, which comprises thousands of developers and testers.},
publisher = {Apress},
author = {Chacon, Scott and Straub, Ben},
year = {2014}
}
@article{wilson_best_2014,
title = {Best {Practices} for {Scientific} {Computing}},
volume = {12},
issn = {1545-7885},
url = {https://journals.plos.org/plosbiology/article?id=10.1371/journal.pbio.1001745},
doi = {10.1371/journal.pbio.1001745},
abstract = {We describe a set of best practices for scientific software development, based on research and experience, that will improve scientists' productivity and the reliability of their software.},
language = {en},
number = {1},
urldate = {2019-11-05},
journal = {PLOS Biology},
author = {Wilson, Greg and Aruliah, D. A. and Brown, C. Titus and Hong, Neil P. Chue and Davis, Matt and Guy, Richard T. and Haddock, Steven H. D. and Huff, Kathryn D. and Mitchell, Ian M. and Plumbley, Mark D. and Waugh, Ben and White, Ethan P. and Wilson, Paul},
month = jan,
year = {2014},
keywords = {Programming languages, Scientists, Computer software, Computers, Open source software, Research validity, Software development, Software tools},
pages = {e1001745},
file = {Full Text PDF:/home/wmay/Zotero/storage/VL8NTV2K/Wilson et al. - 2014 - Best Practices for Scientific Computing.pdf:application/pdf;Snapshot:/home/wmay/Zotero/storage/2JH5JDKG/article.html:text/html}
}
@article{teal_data_2015,
title = {Data {Carpentry}: {Workshops} to {Increase} {Data} {Literacy} for {Researchers}},
volume = {10},
copyright = {Copyright (c)},
issn = {1746-8256},
shorttitle = {Data {Carpentry}},
url = {http://www.ijdc.net/article/view/10.1.135},
doi = {10.2218/ijdc.v10i1.351},
abstract = {In many domains the rapid generation of large amounts of data is fundamentally changing how research is done. The deluge of data presents great opportunities, but also many challenges in managing, analyzing and sharing data. However, good training resources for researchers looking to develop skills that will enable them to be more effective and productive researchers are scarce and there is little space in the existing curriculum for courses or additional lectures. To address this need we have developed an introductory two-day intensive workshop, Data Carpentry, designed to teach basic concepts, skills, and tools for working more effectively and reproducibly with data. These workshops are based on Software Carpentry: two-day, hands-on, bootcamp style workshops teaching best practices in software development, that have demonstrated the success of short workshops to teach foundational research skills. Data Carpentry focuses on data literacy in particular, with the objective of teaching skills to researchers to enable them to retrieve, view, manipulate, analyze and store their and other’s data in an open and reproducible way in order to extract knowledge from data.},
language = {en},
urldate = {2019-11-05},
journal = {International Journal of Digital Curation},
author = {Teal, Tracy K. and Cranston, Karen A. and Lapp, Hilmar and White, Ethan and Wilson, Greg and Ram, Karthik and Pawlik, Aleksandra},
month = feb,
year = {2015},
keywords = {curation, DCC, digital curation, digital preservation, IJDC, International Journal of Digital Curation, preservation},
pages = {135--143},
file = {Full Text PDF:/home/wmay/Zotero/storage/Q6M2Z9AA/Teal et al. - 2015 - Data Carpentry Workshops to Increase Data Literac.pdf:application/pdf;Snapshot:/home/wmay/Zotero/storage/62I238TH/10.1.html:text/html}
}
@article{mislan_elevating_2016,
title = {Elevating {The} {Status} of {Code} in {Ecology}},
volume = {31},
issn = {0169-5347},
url = {http://www.sciencedirect.com/science/article/pii/S0169534715002906},
doi = {10.1016/j.tree.2015.11.006},
abstract = {Code is increasingly central to ecological research but often remains unpublished and insufficiently recognized. Making code available allows analyses to be more easily reproduced and can facilitate research by other scientists. We evaluate journal handling of code, discuss barriers to its publication, and suggest approaches for promoting and archiving code.},
language = {en},
number = {1},
urldate = {2019-11-05},
journal = {Trends in Ecology \& Evolution},
author = {Mislan, K. A. S. and Heer, Jeffrey M. and White, Ethan P.},
month = jan,
year = {2016},
keywords = {data, publish, reproducibility, software},
pages = {4--7},
file = {ScienceDirect Full Text PDF:/home/wmay/Zotero/storage/PFD9T86E/Mislan et al. - 2016 - Elevating The Status of Code in Ecology.pdf:application/pdf;ScienceDirect Snapshot:/home/wmay/Zotero/storage/KCSA24VQ/S0169534715002906.html:text/html}
}
@article{hampton_skills_2017,
title = {Skills and {Knowledge} for {Data}-{Intensive} {Environmental} {Research}},
volume = {67},
issn = {0006-3568},
doi = {10.1093/biosci/bix025},
abstract = {The scale and magnitude of complex and pressing environmental issues lend urgency to the need for integrative and reproducible analysis and synthesis, facilitated by data-intensive research approaches. However, the recent pace of technological change has been such that appropriate skills to accomplish data-intensive research are lacking among environmental scientists, who more than ever need greater access to training and mentorship in computational skills. Here, we provide a roadmap for raising data competencies of current and next-generation environmental researchers by describing the concepts and skills needed for effectively engaging with the heterogeneous, distributed, and rapidly growing volumes of available data. We articulate five key skills: (1) data management and processing, (2) analysis, (3) software skills for science, (4) visualization, and (5) communication methods for collaboration and dissemination. We provide an overview of the current suite of training initiatives available to environmental scientists and models for closing the skill-transfer gap.},
language = {eng},
number = {6},
journal = {Bioscience},
author = {Hampton, Stephanie E. and Jones, Matthew B. and Wasser, Leah A. and Schildhauer, Mark P. and Supp, Sarah R. and Brun, Julien and Hernandez, Rebecca R. and Boettiger, Carl and Collins, Scott L. and Gross, Louis J. and Fernández, Denny S. and Budden, Amber and White, Ethan P. and Teal, Tracy K. and Labou, Stephanie G. and Aukema, Juliann E.},
month = jun,
year = {2017},
pmid = {28584342},
pmcid = {PMC5451289},
keywords = {computing, data management, ecology, informatics, workforce development},
pages = {546--557},
file = {Full Text:/home/wmay/Zotero/storage/SL8W5LEC/Hampton et al. - 2017 - Skills and Knowledge for Data-Intensive Environmen.pdf:application/pdf}
}
@article{ernest_portal_2018,
title = {The {Portal} {Project}: a long-term study of a {Chihuahuan} desert ecosystem},
copyright = {© 2018, Posted by Cold Spring Harbor Laboratory. The copyright holder has placed this preprint in the Public Domain. It is no longer restricted by copyright. Anyone can legally share, reuse, remix, or adapt this material for any purpose without crediting the original authors.},
shorttitle = {The {Portal} {Project}},
url = {https://www.biorxiv.org/content/10.1101/332783v2},
doi = {10.1101/332783},
abstract = {This is a data paper for the Portal Project, a long-term ecological study of rodents, plants, and ants located in southeastern Arizona, U.S.A. This paper contains an overview of methods and information about the structure of the data files and the relational structure among the files. This is a living data paper and will be updated with new information as major changes or additions are made to the data. All data - along with more detailed data collection protocols and site information - is archived at: https://doi.org/10.5281/zenodo.1215988. Background and Summary: Long-term studies play a key role in ecology by providing unique and often foundational insights into how nature operates (Lindenmayer et al 2012; Hughes et al 2017a). Insights from long-term studies have advanced our understanding of the rapidity of species evolution (Boag and Grant 1981; Grant 1985; Arbogast et al 2006) and contributed to the development of ecological theories (Hubbell 2001) and the discovery of anthropogenic impacts on nature (Hughes et al 2017b). Despite the importance of long-term data for understanding how ecosystems and processes change over time, less than 9\% of studies in ecology use data collected for more than a decade (Estes et al 2018). In ecology, data collection for a typical study spans fewer than 3 years (Tilman 1989, Estes et al 2018). Without institutional support (e.g. NSF-funded Long-Term Ecological Research sites), long-term projects can be difficult to maintain, vulnerable to both the vagaries of funding and to the longevity and interest of the scientist running it. Because long-term data is difficult to collect, there is often resistance by its collectors to making it publicly available (Mills et al 2015). Thus long-term data is highly valuable but also less available than other types of data. This data paper describes a publicly-available, long-term study of a Chihuahuan Desert Ecosystem near Portal, Arizona in the United States (aka the Portal Project). Started in 1977, the Portal Project encompasses over 40 years of ecological research, involving both short-term and long-term experiments and monitoring of a variety of different taxa (rodents, plants, and, for many decades, ants). These data have been used in over 100 scientific publications studying competition (e.g., Munger and Brown 1981), granivory (e.g., Chen and Valone 2017), community dynamics (e.g., Ernest et al 2008), and the long-term reorganization of the ecosystem in response to habitat conversion (e.g., Brown et al 1995). Data can be downloaded from Zenodo (https://doi.org/10.5281/zenodo.1215988). The goal of this data paper is to provide an overview of the study, our available data and its structure, and the general data collection and data entry/quality assurance/quality control processes for the different data types. Detailed protocols for data collection and curation are in the metadata associated with the archived data.},
language = {en},
urldate = {2019-11-07},
journal = {bioRxiv},
author = {Ernest, S. K. Morgan and Yenni, Glenda M. and Allington, Ginger and Bledsoe, Ellen K. and Christensen, Erica M. and Diaz, Renata M. and Geluso, Keith and Goheen, Jacob R. and Guo, Qinfeng and Heske, Edward and Kelt, Douglas and Meiners, Joan M. and Munger, Jim and Restrepo, Carla and Samson, Douglas A. and Schutzenhofer, Michele R. and Skupski, Marian and Supp, Sarah R. and Thibault, Kate and Taylor, Shawn and White, Ethan and Davidson, Diane W. and Brown, James H. and Valone, Thomas J.},
month = nov,
year = {2018},
pages = {332783},
file = {Full Text PDF:/home/wmay/Zotero/storage/HD9NNUZU/Ernest et al. - 2018 - The Portal Project a long-term study of a Chihuah.pdf:application/pdf;Snapshot:/home/wmay/Zotero/storage/QMPNKCMW/332783v2.html:text/html}
}
---
title: "Developing a modern data workflow for regularly updated data (Portal Project)"
subtitle: 'Source: [Gist](https://gist.github.com/wmay/4d070445f93025571c7032e8a1bd0572)'
author: "William May"
date: "`r gsub(' 0', ' ', format(Sys.Date(), '%B %d, %Y'))`"
output:
  ioslides_presentation:
    widescreen: true
bibliography: data_workflow.bib
---
<style>
/* 2-column formatting */
.columns-2 h3 {
  text-decoration: underline;
}
.columns-2 {
  height: 76%;
  -webkit-column-gap: 10%;
  -moz-column-gap: 10%;
  column-gap: 10%;
  -webkit-column-rule: 4px solid #515151;
  -moz-column-rule: 4px solid #515151;
  column-rule: 4px solid #515151;
}
.forceBreak {
  -webkit-column-break-after: always;
  break-after: column;
}
/* Definition lists */
dt {
  font-weight: bold;
  margin-top: 30px;
  margin-bottom: 10px;
}
dt::after {
  content: ":";
}
dd {
  margin-left: 20px;
}
/* fix reference formatting by allowing scrolling */
.references {
  height: 500px;
  overflow: scroll;
}
</style>
```{r setup, include=FALSE}
library(fontawesome)
knitr::opts_chunk$set(echo = FALSE)
```

# Introduction

## The Portal Project

- "a small group of researchers managing an ongoing, long-term research project"
- "automated and manual data collection efforts at daily to annual frequencies conducted over 40 years"
- "a regularly changing group of personnel"
- part of the interdisciplinary Weecology group that publishes papers about scientific computing and data management [@wilson_best_2014; @teal_data_2015; @hampton_skills_2017; @yenni_developing_2019]
- [portal.weecology.org](https://portal.weecology.org/), [www.weecology.org](https://www.weecology.org/)

## Problem

We have continuously updated data. How do we

- efficiently process the data?
- support reproducible research, even though our data is changing?
- credit contributors for their work?

## The Portal Project's Answers

- Excel data validation
- Software tools:
  - automation (R, Travis CI)
  - unit tests (R `testthat` package)
  - version control (git, Github)
- Data repository (DOIs, Zenodo)
- Data paper preprint (bioRxiv)

# Tools

## Unit tests

Code that tests code. Example:

```{r unittest, echo=TRUE, error=TRUE}
library(testthat)
one_plus_one = function() {
  return(22)  # deliberately wrong (should return 2) so the test below fails
}
expect_equal(one_plus_one(), 2)  # <- this is a unit test
```
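
The same idea applies to data: tests can check each new batch of observations before it is accepted. A minimal sketch with a made-up table and checks:

```{r datatest, echo=TRUE}
library(testthat)
# Toy stand-in for newly entered data
new_obs <- data.frame(
  time  = as.Date(c("2019-11-01", "2019-11-02", "2019-11-03")),
  value = c(1.2, 0.8, 1.5)
)
expect_false(any(duplicated(new_obs$time)))  # no duplicated timestamps
expect_true(all(new_obs$value >= 0))         # values within a plausible range
```

When the checks pass they are silent; a failure raises an error and stops the script, just like the failing example above.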

## `r fa('git-alt', fill='#f14e32')` Git (version control)

Saves and organizes code versions ("commits"). Versions are grouped into *branches* that can be edited independently.

```{r branches, fig.cap="A common branching strategy. From *Pro Git* [@chacon_pro_2014]"}
knitr::include_graphics('git_branches.png')
```
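
For concreteness, a minimal sketch of the commit-and-branch cycle, driven from R with `system2()` (not evaluated here; these are generic `git` commands usually typed at a shell, not the Portal Project's setup):

```{r gitsketch, echo=TRUE, eval=FALSE}
repo <- file.path(tempdir(), "demo-repo")  # throwaway example repository
dir.create(repo)
owd <- setwd(repo)
system2("git", "init")
writeLines("site,time,value", "data.csv")
system2("git", c("add", "data.csv"))
system2("git", c("commit", "-m", shQuote("Add raw data file")))  # save a version
system2("git", c("checkout", "-b", "qa-fixes"))  # new branch, edited independently
setwd(owd)
```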

## `r fa('github')` Github and <img src="TravisCI-Mascot-1.png" alt="Travis CI logo" width="40px" height="40px"> Travis CI

Github:

- popular website that hosts git repositories
- allows others to view and contribute to your repository
- connects with many other software tools

Travis CI:

- runs tasks in response to events on Github
- often used to run unit tests and build packages (see the config sketch below)
- free for open source projects
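
As an illustration only (not the Portal Project's actual configuration), a minimal `.travis.yml` for an R repository might restore cached packages and run the test suite on every push. The chunk below just writes such a file so its contents can be shown:

```{r travissketch, echo=TRUE, eval=FALSE}
# Hypothetical minimal Travis CI config for an R project; the file normally
# lives at the root of the GitHub repository.
writeLines(c(
  "language: r",
  "cache: packages",
  "script:",
  "  - Rscript -e 'testthat::test_dir(\"tests/testthat\")'"
), ".travis.yml")
```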

## DOIs and Zenodo

Digital Object Identifier (DOI):

- permanent link to an object -- very important for reproducible research! (see the check below)
- URLs are not considered permanent
- According to NCAR, DOIs are "Widely accepted by publishing community and becoming a best practice for scientific publications that have facts based on data" ([source](https://rda.ucar.edu/#!data-citation))

Zenodo:

- DOI-issuing repository
- operated by CERN
- connects with many research and code tools, such as ORCID, Github, etc.
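
A DOI can also be checked programmatically. A small sketch (not run; needs a network connection) using the Portal Project archive DOI cited later in these slides:

```{r doisketch, echo=TRUE, eval=FALSE}
# A DOI resolves through doi.org to the current landing page, so the identifier
# keeps working even if the page itself moves.
h <- curlGetHeaders("https://doi.org/10.5281/zenodo.1215988", redirect = TRUE)
attr(h, "status")  # final HTTP status after following redirects (200 = reachable)
```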

# Workflow

## Portal Project workflow

```{r overview, out.width='100%', fig.cap="Overview of the data workflow, from @yenni_developing_2019"}
knitr::include_graphics('overview.PNG')
```

## Raw data collection {.columns-2}

### Portal Project

- Travis CI runs R scripts that download raw data
- Field-collected data entered manually

<p class="forceBreak"></p>

### ASRC

- Linux cron job runs R script that downloads raw data (see the sketch below)
- Some data collected via email
- Calibration results, some data flags and supporting information entered manually

To do:

- Automate remaining raw data collection
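
For illustration, the kind of script such a scheduled job might run (hypothetical URL and paths; the real scripts live in the project repositories):

```{r downloadsketch, echo=TRUE, eval=FALSE}
# Download today's raw file into a dated location so nothing gets overwritten.
raw_url  <- "https://example.org/station/latest.csv"  # placeholder source
raw_file <- file.path("raw", format(Sys.Date(), "station_%Y-%m-%d.csv"))
dir.create("raw", showWarnings = FALSE)
download.file(raw_url, raw_file, mode = "wb")
```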

## Data Entry QA {.columns-2}

### Portal Project

- Excel data validation rules
- Comparison of two transcribed data files
- Unit tests for data
- New data reviewed by data manager

<p class="forceBreak"></p>

### ASRC

- Postgres table constraints (see the sketch below):
  - "unique" constraints
  - foreign keys
  - time-related constraints
- Data flags and calibration files reviewed by Jim Schwab

To do:

- Automate tests for calibration files
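
For illustration, the kinds of constraints listed above expressed as DDL sent from R with DBI (hypothetical table and column names; an in-memory SQLite database stands in for the Postgres server, and time-related checks would be additional CHECK constraints):

```{r constraintsketch, echo=TRUE, eval=FALSE}
library(DBI)
con <- dbConnect(RSQLite::SQLite(), ":memory:")  # stand-in for the Postgres connection
dbExecute(con, "CREATE TABLE sites (site TEXT PRIMARY KEY)")
dbExecute(con, "
  CREATE TABLE measurements (
    site  TEXT NOT NULL REFERENCES sites (site),  -- foreign key: site must exist
    time  TEXT NOT NULL,
    value REAL CHECK (value >= -999),             -- reject obviously bad values
    UNIQUE (site, time)                           -- no duplicate observations
  )")
dbDisconnect(con)
```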

## Data Processing {.columns-2}

### Portal Project

- Data processing is fully automated
- Unit tests for processed data

<p class="forceBreak"></p>

### ASRC

- Data processing is mostly automated
- Unit tests for processing code
- Some data flags entered manually (not easily automatable)

To do:

- Unit tests for processed data

## Data Versioning {.columns-2}

### Portal Project

- All data and code in a Github repository ([link](https://github.com/weecology/PortalData))
- Travis CI creates a Github release when new data is added

<p class="forceBreak"></p>

### ASRC

- Only processing code on Github (data too large for Github) ([link](https://github.com/ASRCsoft/nysatmoschem))
- Manually create a Github release when new files are released

To do:

- Travis CI builds dataset and creates Github release? (see the sketch below)
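
One possible shape for that automation (hypothetical names; assumes the `gh` package and a `GITHUB_PAT` token available to the CI job): after the dataset builds and its tests pass, create the release through the GitHub API.

```{r releasesketch, echo=TRUE, eval=FALSE}
library(gh)  # thin GitHub API client
gh("POST /repos/:owner/:repo/releases",
   owner    = "example-org",
   repo     = "example-data",
   tag_name = "v1.2.0",
   name     = "Data release v1.2.0",
   body     = "Automated data release.")
```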

## Archiving {.columns-2}

### Portal Project

- Github repository archived on Zenodo ([DOI](https://doi.org/10.5281/zenodo.1215988))
- Zenodo adds new version to the archive in response to Github releases

<p class="forceBreak"></p>

### ASRC

- Processed data files archived on atmoschem server ([link](http://atmoschem.asrc.cestm.albany.edu/~aqm/AQM_Products/))
- Manually add new files to the atmoschem server and send announcement email

To do:

- Archive data on a DOI-issuing repository
- Automate archive updates

## Citation {.columns-2}

### Portal Project

- Periodically updated paper preprint on bioRxiv [@ernest_portal_2018]

<p class="forceBreak"></p>

### ASRC

To do:

- ???

Are paper citations better than data citations?

## Complications

Bigger data
~ Github doesn't hold our raw data. So what do we do with it? (Archive it separately, I suppose.)

Complex processing
~ Too many moving parts to keep organized in one repository or package. Where to put the dataset-creating code?

Multiple datasets
~ How should we organize and automate multiple, closely related datasets?

# End

## References {.smaller}