This may change due to length considerations. Parts in bold are available for early release from O'Reilly.
- How to Learn Robust and Reproducible Bioinformatics
# Factors are more memory efficient (when labels are longer than a few bytes), since each | |
# distinct multi-byte label is stored only once in memory (as an attribute), and integer | |
# codes keep the mapping from each element to its label. E.g.: | |
a = sample(paste0("chrom", c(1:22, "X", "Y")), 1e8, replace=TRUE) | |
object.size(a) | |
# 800001192 bytes | |
object.size(factor(a)) | |
# 400001744 bytes | |
# For long character vectors of repeating values, this *really* pays off. |
library(ggplot2) | |
library(lubridate) | |
library(dplyr) | |
library(reshape2) | |
myname <- "@vsbuffalo" # for removing later | |
d <- read.csv("tweets.csv", header=TRUE, stringsAsFactors=FALSE) | |
extractMentions <- function(x) { | |
gsub("[^@]*(@[a-zA-Z0-9_]+).*", "\\1", x, perl=TRUE) |
This may change due to length considerations. Parts in bold are available for early release from O'Reilly.
library(GenomicRanges) | |
summarizeByTile <- | |
# given a GRanges (or some sort of ranged data) object `x`, and a | |
# *corresponding* vector of values to summarize `y` (these *must* | |
# correspond), calculate the summary per tile with the function `fun`. | |
# Note: this is still beta; wider tests coming, use with caution. | |
function(x, y, tiles, fun, mcol_name="y") { | |
stopifnot(length(x) == length(y)) |
from __future__ import division | |
from collections import Counter | |
from math import log | |
def entropy(seq, unit="bit"): | |
""" | |
Returns entropy of DNA sequence. | |
The entropy formula is: | |
entropy = -sum_i (log(p_i) * p_i) |
""" | |
entropy.py | |
Calculate entropy of a given list. | |
""" | |
from math import log, log10 | |
from collections import Counter | |
import pdb | |
def entropy(x, logfun=lambda x: log(x, 2)): |
import numpy as np | |
from itertools import combinations | |
from collections import Counter | |
import datetime as dt | |
np.random.seed(0) | |
def repeat_mutation_sim(G, N, L, mu=3e-8): | |
""" | |
Generate N repeats of length L mutating at rate |
# use GNU screen's C-a binding, since it's programmed in my brain | |
set-option -g prefix C-a | |
unbind C-b | |
# use GNU screen's C-a C-a for last window | |
bind-key C-a last-window | |
# use 1-based indexing, since 1 is close | |
set -g base-index 1 |
#!/bin/bash | |
# trim.sh - generic, slightly insane paired-end quality trimming script | |
# Vince Buffalo <vsbuffaloAAAAAA@gmail.com> (sans poly-A) | |
set -e | |
set -u | |
## pre-config | |
ADAPTERS=illumina_adapters.fa | |
SAMPLE_NAME=some_sample_name | |
IN1=in1.fastq |