Skip to content

Instantly share code, notes, and snippets.

View vsbuffalo's full-sized avatar

Vince Buffalo vsbuffalo

View GitHub Profile
@vsbuffalo
vsbuffalo / bds-toc.md
Last active August 29, 2015 13:58
Bioinformatics Data Skills ToC

Bioinformatics Data Skills Table of Contents

This may change due to length considerations. Parts in bold are available for early release from O'Reilly.

Part I. Ideology: Data Skills, Robust and Reproducible Bioinformatics

  • How to Learn Robust and Reproducible Bioinformatics

Part II. Prerequisites: Setting up a Project, Working with Unix, Version Control, and Data

@vsbuffalo
vsbuffalo / tweets.R
Created May 22, 2014 20:30
Visualize your mentions over time
library(ggplot2)
library(lubridate)
library(dplyr)
library(reshape2)
myname <- "@vsbuffalo" # for removing later
d <- read.csv("tweets.csv", header=TRUE, stringsAsFactors=FALSE)
extractMentions <- function(x) {
gsub("[^@]*(@[a-zA-Z0-9_]+).*", "\\1", x, perl=TRUE)
# Factors are more memory efficient (if labels > few bytes), since redundant multi-byte
# labels are stored once in memory (as attributes), and integers keep the mapping. E.g.:
a = sample(paste0("chrom", c(1:22, "X", "Y")), 1e8, replace=TRUE)
object.size(a)
# 800001192 bytes
object.size(factor(a))
# 400001744 bytes
# For long character vectors of repeating values, this *really* pays off.
@vsbuffalo
vsbuffalo / ex.R
Last active August 29, 2015 14:05
# Rather trivial example of how dataframe row indexing could be used.
# I would not use this method, but it's not terrible design — access
# is fast compared to say genotype == 1L. Why limit these cases
> genotypes <- sample(0:2, 10, replace=TRUE)
> freq <- 0.3
> df <- data.frame(name=c("aa", "Aa", "AA"), type=c("homozygote", "heterozygote", "homozygote"),
freq=c((1-freq)^2, 2*freq*(1-freq), freq^2))
> df
name type freq
# A quick function to save a PBM (http://en.wikipedia.org/wiki/Netpbm_format)
# visualize *a lot* of missing data pretty quickly (outside of R).
writeMissingPBM <- function(x, file) {
dims <- dim(x)
x[] <- as.integer(is.na(x))
con <- file(file, open="wt")
writeLines(sprintf("P1\n%d %d", ncol(x), nrow(x)), con)
write.table(x, file=con, sep=" ", col.names=FALSE, row.names=FALSE, quote=FALSE)
close(con)
@HD VN:1.0 GO:none SO:coordinate
@SQ SN:1 LN:249250621 AS:NCBI37 UR:file:/lustre/scratch102/projects/g1k/ref/main_project/human_g1k_v37.fasta M5:1b22b98cdeb4a9304cb5d48026a85128
@SQ SN:2 LN:243199373 AS:NCBI37 UR:file:/lustre/scratch102/projects/g1k/ref/main_project/human_g1k_v37.fasta M5:a0d9851da00400dec1098a9255ac712e
@SQ SN:3 LN:198022430 AS:NCBI37 UR:file:/lustre/scratch102/projects/g1k/ref/main_project/human_g1k_v37.fasta M5:fdfd811849cc2fadebc929bb925902e5
@SQ SN:4 LN:191154276 AS:NCBI37 UR:file:/lustre/scratch102/projects/g1k/ref/main_project/human_g1k_v37.fasta M5:23dccd106897542ad87d2765d28a19a1
@SQ SN:5 LN:180915260 AS:NCBI37 UR:file:/lustre/scratch102/projects/g1k/ref/main_project/human_g1k_v37.fasta M5:0740173db9ffd264d728f32784845cd7
@SQ SN:6 LN:171115067 AS:NCBI37 UR:file:/lustre/scratch102/projects/g1k/ref/main_project/human_g1k_v37.fasta M5:1d3a93a248d92a729ee764823acbbc6b
@SQ SN:7 LN:159138663 AS:NCBI37 UR:file:/lustre/scratch102/projects/g1k/ref/main_project/human_g1k_v37.fasta M5:618366e95
@HD VN:1.0 GO:none SO:coordinate
@SQ SN:1 LN:249250621 AS:NCBI37 UR:file:/lustre/scratch102/projects/g1k/ref/main_project/human_g1k_v37.fasta M5:1b22b98cdeb4a9304cb5d48026a85128
@SQ SN:2 LN:243199373 AS:NCBI37 UR:file:/lustre/scratch102/projects/g1k/ref/main_project/human_g1k_v37.fasta M5:a0d9851da00400dec1098a9255ac712e
@SQ SN:3 LN:198022430 AS:NCBI37 UR:file:/lustre/scratch102/projects/g1k/ref/main_project/human_g1k_v37.fasta M5:fdfd811849cc2fadebc929bb925902e5
@SQ SN:4 LN:191154276 AS:NCBI37 UR:file:/lustre/scratch102/projects/g1k/ref/main_project/human_g1k_v37.fasta M5:23dccd106897542ad87d2765d28a19a1
@SQ SN:5 LN:180915260 AS:NCBI37 UR:file:/lustre/scratch102/projects/g1k/ref/main_project/human_g1k_v37.fasta M5:0740173db9ffd264d728f32784845cd7
@SQ SN:6 LN:171115067 AS:NCBI37 UR:file:/lustre/scratch102/projects/g1k/ref/main_project/human_g1k_v37.fasta M5:1d3a93a248d92a729ee764823acbbc6b
@SQ SN:7 LN:159138663 AS:NCBI37 UR:file:/lustre/scratch102/projects/g1k/ref/main_project/human_g1k_v37.fasta M5:618366e95
import sys
import pysam
from collections import Counter
if len(sys.argv) < 2:
sys.exit("usage: alnstat.py in.bam")
fname = sys.argv[1]
mode = "rb" if fname.endswith(".bam") else "r"
bamfile = pysam.AlignmentFile(fname, mode)
parseMS <- function(file) {
# simple MS parser
lns <- readLines(file)
header <- lns[1:3]
body <- lns[-c(1:3)]
brks <- body == "//"
groups <- cumsum(brks)
sims <- split(body, groups)
hpdi_stat <- function(x, prob=0.95) {
# stat functions for ggplot that handle HPDI from samples
# automatically.
hpdi <- HPDinterval(as.mcmc(x))
c(y=mean(x), ymin=hpdi[1, 'lower'], ymax=hpdi[1, 'upper'])
}