Vince Buffalo vsbuffalo

## bds-toc.md

      
              1 file
            
          
              0 forks
            
          
              0 comments
            
          
              1 star
            
          
                vsbuffalo
                / bds-toc.md
            
            
              Last active
              August 29, 2015 13:58
            
              
                Bioinformatics Data Skills ToC
              
          
    Bioinformatics Data Skills Table of Contents

This may change due to length considerations. Parts in bold are available for early release from O'Reilly.
Part I. Ideology: Data Skills, Robust and Reproducible Bioinformatics


How to Learn Robust and Reproducible Bioinformatics

Part II. Prerequisites: Setting up a Project, Working with Unix, Version Control, and Data


## tweets.R
library(ggplot2)
library(lubridate)
library(dplyr)
library(reshape2)
myname <- "@vsbuffalo" # for removing later

d <- read.csv("tweets.csv", header=TRUE, stringsAsFactors=FALSE)

extractMentions <- function(x) {
    gsub("[^@]*(@[a-zA-Z0-9_]+).*", "\\1", x, perl=TRUE)

## factors-memory.R
# Factors are more memory efficient (if labels > few bytes), since redundant multi-byte
# labels are stored once in memory (as attributes), and integers keep the mapping. E.g.:

a = sample(paste0("chrom", c(1:22, "X", "Y")), 1e8, replace=TRUE)
object.size(a)
# 800001192 bytes
object.size(factor(a))
# 400001744 bytes

# For long character vectors of repeating values, this *really* pays off.

## ex.R
# Rather trivial example of how dataframe row indexing could be used.
# I would not use this method, but it's not terrible design — access
# is fast compared to say genotype == 1L. Why limit these cases

> genotypes <- sample(0:2, 10, replace=TRUE)
> freq <- 0.3
> df <- data.frame(name=c("aa", "Aa", "AA"), type=c("homozygote", "heterozygote", "homozygote"),
        freq=c((1-freq)^2, 2*freq*(1-freq), freq^2))
> df
  name         type freq

## missing.R
# A quick function to save a PBM (http://en.wikipedia.org/wiki/Netpbm_format)
# visualize *a lot* of missing data pretty quickly (outside of R).

writeMissingPBM <- function(x, file) {
  dims <- dim(x)
  x[] <- as.integer(is.na(x))
  con <- file(file, open="wt")
  writeLines(sprintf("P1\n%d %d", ncol(x), nrow(x)), con)
  write.table(x, file=con, sep=" ", col.names=FALSE, row.names=FALSE, quote=FALSE)
  close(con)

## example.bam
@HD	VN:1.0	GO:none	SO:coordinate
@SQ	SN:1	LN:249250621	AS:NCBI37	UR:file:/lustre/scratch102/projects/g1k/ref/main_project/human_g1k_v37.fasta	M5:1b22b98cdeb4a9304cb5d48026a85128
@SQ	SN:2	LN:243199373	AS:NCBI37	UR:file:/lustre/scratch102/projects/g1k/ref/main_project/human_g1k_v37.fasta	M5:a0d9851da00400dec1098a9255ac712e
@SQ	SN:3	LN:198022430	AS:NCBI37	UR:file:/lustre/scratch102/projects/g1k/ref/main_project/human_g1k_v37.fasta	M5:fdfd811849cc2fadebc929bb925902e5
@SQ	SN:4	LN:191154276	AS:NCBI37	UR:file:/lustre/scratch102/projects/g1k/ref/main_project/human_g1k_v37.fasta	M5:23dccd106897542ad87d2765d28a19a1
@SQ	SN:5	LN:180915260	AS:NCBI37	UR:file:/lustre/scratch102/projects/g1k/ref/main_project/human_g1k_v37.fasta	M5:0740173db9ffd264d728f32784845cd7
@SQ	SN:6	LN:171115067	AS:NCBI37	UR:file:/lustre/scratch102/projects/g1k/ref/main_project/human_g1k_v37.fasta	M5:1d3a93a248d92a729ee764823acbbc6b
@SQ	SN:7	LN:159138663	AS:NCBI37	UR:file:/lustre/scratch102/projects/g1k/ref/main_project/human_g1k_v37.fasta	M5:618366e95

## example.sam
@HD	VN:1.0	GO:none	SO:coordinate
@SQ	SN:1	LN:249250621	AS:NCBI37	UR:file:/lustre/scratch102/projects/g1k/ref/main_project/human_g1k_v37.fasta	M5:1b22b98cdeb4a9304cb5d48026a85128
@SQ	SN:2	LN:243199373	AS:NCBI37	UR:file:/lustre/scratch102/projects/g1k/ref/main_project/human_g1k_v37.fasta	M5:a0d9851da00400dec1098a9255ac712e
@SQ	SN:3	LN:198022430	AS:NCBI37	UR:file:/lustre/scratch102/projects/g1k/ref/main_project/human_g1k_v37.fasta	M5:fdfd811849cc2fadebc929bb925902e5
@SQ	SN:4	LN:191154276	AS:NCBI37	UR:file:/lustre/scratch102/projects/g1k/ref/main_project/human_g1k_v37.fasta	M5:23dccd106897542ad87d2765d28a19a1
@SQ	SN:5	LN:180915260	AS:NCBI37	UR:file:/lustre/scratch102/projects/g1k/ref/main_project/human_g1k_v37.fasta	M5:0740173db9ffd264d728f32784845cd7
@SQ	SN:6	LN:171115067	AS:NCBI37	UR:file:/lustre/scratch102/projects/g1k/ref/main_project/human_g1k_v37.fasta	M5:1d3a93a248d92a729ee764823acbbc6b
@SQ	SN:7	LN:159138663	AS:NCBI37	UR:file:/lustre/scratch102/projects/g1k/ref/main_project/human_g1k_v37.fasta	M5:618366e95

## alnstat.py
import sys
import pysam
from collections import Counter

if len(sys.argv) < 2:
    sys.exit("usage: alnstat.py in.bam")

fname = sys.argv[1]
mode = "rb" if fname.endswith(".bam") else "r"
bamfile = pysam.AlignmentFile(fname, mode)

## parseMS.R


parseMS <- function(file) {
   # simple MS parser
   lns <- readLines(file)
   header <- lns[1:3]
   body <- lns[-c(1:3)]
   brks <- body == "//"
   groups <- cumsum(brks)
   sims <- split(body, groups)

## hpdi_stat.R
hpdi_stat <- function(x, prob=0.95) {
   # stat functions for ggplot that handle HPDI from samples
   # automatically.
   hpdi <- HPDinterval(as.mcmc(x))
   c(y=mean(x), ymin=hpdi[1, 'lower'], ymax=hpdi[1, 'upper'])
 }
	library(ggplot2)
	library(lubridate)
	library(dplyr)
	library(reshape2)
	myname <- "@vsbuffalo" # for removing later

	d <- read.csv("tweets.csv", header=TRUE, stringsAsFactors=FALSE)

	extractMentions <- function(x) {
	gsub("[^@](@[a-zA-Z0-9_]+).", "\\1", x, perl=TRUE)
	# Factors are more memory efficient (if labels > few bytes), since redundant multi-byte
	# labels are stored once in memory (as attributes), and integers keep the mapping. E.g.:

	a = sample(paste0("chrom", c(1:22, "X", "Y")), 1e8, replace=TRUE)
	object.size(a)
	# 800001192 bytes
	object.size(factor(a))
	# 400001744 bytes

	# For long character vectors of repeating values, this really pays off.
	# Rather trivial example of how dataframe row indexing could be used.
	# I would not use this method, but it's not terrible design — access
	# is fast compared to say genotype == 1L. Why limit these cases

	> genotypes <- sample(0:2, 10, replace=TRUE)
	> freq <- 0.3
	> df <- data.frame(name=c("aa", "Aa", "AA"), type=c("homozygote", "heterozygote", "homozygote"),
	freq=c((1-freq)^2, 2freq(1-freq), freq^2))
	> df
	name type freq
	# A quick function to save a PBM (http://en.wikipedia.org/wiki/Netpbm_format)
	# visualize a lot of missing data pretty quickly (outside of R).

	writeMissingPBM <- function(x, file) {
	dims <- dim(x)
	x[] <- as.integer(is.na(x))
	con <- file(file, open="wt")
	writeLines(sprintf("P1\n%d %d", ncol(x), nrow(x)), con)
	write.table(x, file=con, sep=" ", col.names=FALSE, row.names=FALSE, quote=FALSE)
	close(con)
	@HD VN:1.0 GO:none SO:coordinate
	@SQ SN:1 LN:249250621 AS:NCBI37 UR:file:/lustre/scratch102/projects/g1k/ref/main_project/human_g1k_v37.fasta M5:1b22b98cdeb4a9304cb5d48026a85128
	@SQ SN:2 LN:243199373 AS:NCBI37 UR:file:/lustre/scratch102/projects/g1k/ref/main_project/human_g1k_v37.fasta M5:a0d9851da00400dec1098a9255ac712e
	@SQ SN:3 LN:198022430 AS:NCBI37 UR:file:/lustre/scratch102/projects/g1k/ref/main_project/human_g1k_v37.fasta M5:fdfd811849cc2fadebc929bb925902e5
	@SQ SN:4 LN:191154276 AS:NCBI37 UR:file:/lustre/scratch102/projects/g1k/ref/main_project/human_g1k_v37.fasta M5:23dccd106897542ad87d2765d28a19a1
	@SQ SN:5 LN:180915260 AS:NCBI37 UR:file:/lustre/scratch102/projects/g1k/ref/main_project/human_g1k_v37.fasta M5:0740173db9ffd264d728f32784845cd7
	@SQ SN:6 LN:171115067 AS:NCBI37 UR:file:/lustre/scratch102/projects/g1k/ref/main_project/human_g1k_v37.fasta M5:1d3a93a248d92a729ee764823acbbc6b
	@SQ SN:7 LN:159138663 AS:NCBI37 UR:file:/lustre/scratch102/projects/g1k/ref/main_project/human_g1k_v37.fasta M5:618366e95
	import sys
	import pysam
	from collections import Counter

	if len(sys.argv) < 2:
	sys.exit("usage: alnstat.py in.bam")

	fname = sys.argv[1]
	mode = "rb" if fname.endswith(".bam") else "r"
	bamfile = pysam.AlignmentFile(fname, mode)


	parseMS <- function(file) {
	# simple MS parser
	lns <- readLines(file)
	header <- lns[1:3]
	body <- lns[-c(1:3)]
	brks <- body == "//"
	groups <- cumsum(brks)
	sims <- split(body, groups)
	hpdi_stat <- function(x, prob=0.95) {
	# stat functions for ggplot that handle HPDI from samples
	# automatically.
	hpdi <- HPDinterval(as.mcmc(x))
	c(y=mean(x), ymin=hpdi[1, 'lower'], ymax=hpdi[1, 'upper'])
	}