This may change due to length considerations. Parts in bold are available for early release from O'Reilly.
- How to Learn Robust and Reproducible Bioinformatics
This may change due to length considerations. Parts in bold are available for early release from O'Reilly.
library(ggplot2) | |
library(lubridate) | |
library(dplyr) | |
library(reshape2) | |
myname <- "@vsbuffalo" # for removing later | |
d <- read.csv("tweets.csv", header=TRUE, stringsAsFactors=FALSE) | |
extractMentions <- function(x) { | |
gsub("[^@]*(@[a-zA-Z0-9_]+).*", "\\1", x, perl=TRUE) |
# Factors are more memory efficient (if labels > few bytes), since redundant multi-byte | |
# labels are stored once in memory (as attributes), and integers keep the mapping. E.g.: | |
a = sample(paste0("chrom", c(1:22, "X", "Y")), 1e8, replace=TRUE) | |
object.size(a) | |
# 800001192 bytes | |
object.size(factor(a)) | |
# 400001744 bytes | |
# For long character vectors of repeating values, this *really* pays off. |
# Rather trivial example of how dataframe row indexing could be used. | |
# I would not use this method, but it's not terrible design — access | |
# is fast compared to say genotype == 1L. Why limit these cases | |
> genotypes <- sample(0:2, 10, replace=TRUE) | |
> freq <- 0.3 | |
> df <- data.frame(name=c("aa", "Aa", "AA"), type=c("homozygote", "heterozygote", "homozygote"), | |
freq=c((1-freq)^2, 2*freq*(1-freq), freq^2)) | |
> df | |
name type freq |
# A quick function to save a PBM (http://en.wikipedia.org/wiki/Netpbm_format) | |
# visualize *a lot* of missing data pretty quickly (outside of R). | |
writeMissingPBM <- function(x, file) { | |
dims <- dim(x) | |
x[] <- as.integer(is.na(x)) | |
con <- file(file, open="wt") | |
writeLines(sprintf("P1\n%d %d", ncol(x), nrow(x)), con) | |
write.table(x, file=con, sep=" ", col.names=FALSE, row.names=FALSE, quote=FALSE) | |
close(con) |
@HD VN:1.0 GO:none SO:coordinate | |
@SQ SN:1 LN:249250621 AS:NCBI37 UR:file:/lustre/scratch102/projects/g1k/ref/main_project/human_g1k_v37.fasta M5:1b22b98cdeb4a9304cb5d48026a85128 | |
@SQ SN:2 LN:243199373 AS:NCBI37 UR:file:/lustre/scratch102/projects/g1k/ref/main_project/human_g1k_v37.fasta M5:a0d9851da00400dec1098a9255ac712e | |
@SQ SN:3 LN:198022430 AS:NCBI37 UR:file:/lustre/scratch102/projects/g1k/ref/main_project/human_g1k_v37.fasta M5:fdfd811849cc2fadebc929bb925902e5 | |
@SQ SN:4 LN:191154276 AS:NCBI37 UR:file:/lustre/scratch102/projects/g1k/ref/main_project/human_g1k_v37.fasta M5:23dccd106897542ad87d2765d28a19a1 | |
@SQ SN:5 LN:180915260 AS:NCBI37 UR:file:/lustre/scratch102/projects/g1k/ref/main_project/human_g1k_v37.fasta M5:0740173db9ffd264d728f32784845cd7 | |
@SQ SN:6 LN:171115067 AS:NCBI37 UR:file:/lustre/scratch102/projects/g1k/ref/main_project/human_g1k_v37.fasta M5:1d3a93a248d92a729ee764823acbbc6b | |
@SQ SN:7 LN:159138663 AS:NCBI37 UR:file:/lustre/scratch102/projects/g1k/ref/main_project/human_g1k_v37.fasta M5:618366e95 |
@HD VN:1.0 GO:none SO:coordinate | |
@SQ SN:1 LN:249250621 AS:NCBI37 UR:file:/lustre/scratch102/projects/g1k/ref/main_project/human_g1k_v37.fasta M5:1b22b98cdeb4a9304cb5d48026a85128 | |
@SQ SN:2 LN:243199373 AS:NCBI37 UR:file:/lustre/scratch102/projects/g1k/ref/main_project/human_g1k_v37.fasta M5:a0d9851da00400dec1098a9255ac712e | |
@SQ SN:3 LN:198022430 AS:NCBI37 UR:file:/lustre/scratch102/projects/g1k/ref/main_project/human_g1k_v37.fasta M5:fdfd811849cc2fadebc929bb925902e5 | |
@SQ SN:4 LN:191154276 AS:NCBI37 UR:file:/lustre/scratch102/projects/g1k/ref/main_project/human_g1k_v37.fasta M5:23dccd106897542ad87d2765d28a19a1 | |
@SQ SN:5 LN:180915260 AS:NCBI37 UR:file:/lustre/scratch102/projects/g1k/ref/main_project/human_g1k_v37.fasta M5:0740173db9ffd264d728f32784845cd7 | |
@SQ SN:6 LN:171115067 AS:NCBI37 UR:file:/lustre/scratch102/projects/g1k/ref/main_project/human_g1k_v37.fasta M5:1d3a93a248d92a729ee764823acbbc6b | |
@SQ SN:7 LN:159138663 AS:NCBI37 UR:file:/lustre/scratch102/projects/g1k/ref/main_project/human_g1k_v37.fasta M5:618366e95 |
import sys | |
import pysam | |
from collections import Counter | |
if len(sys.argv) < 2: | |
sys.exit("usage: alnstat.py in.bam") | |
fname = sys.argv[1] | |
mode = "rb" if fname.endswith(".bam") else "r" | |
bamfile = pysam.AlignmentFile(fname, mode) |
parseMS <- function(file) { | |
# simple MS parser | |
lns <- readLines(file) | |
header <- lns[1:3] | |
body <- lns[-c(1:3)] | |
brks <- body == "//" | |
groups <- cumsum(brks) | |
sims <- split(body, groups) |
hpdi_stat <- function(x, prob=0.95) { | |
# stat functions for ggplot that handle HPDI from samples | |
# automatically. | |
hpdi <- HPDinterval(as.mcmc(x)) | |
c(y=mean(x), ymin=hpdi[1, 'lower'], ymax=hpdi[1, 'upper']) | |
} |