Vince Buffalo vsbuffalo

## factors-memory.R
# Factors are more memory efficient than character vectors holding many
# repeated labels: each distinct label is stored once (in the `levels`
# attribute) and every element is a 4-byte integer code that maps to it.
# A character vector instead keeps one pointer per element (8 bytes on
# 64-bit R), so the factor below needs roughly half the memory. E.g.:

a <- sample(paste0("chrom", c(1:22, "X", "Y")), 1e8, replace = TRUE)
object.size(a)
# 800001192 bytes
object.size(factor(a))
# 400001744 bytes

# For long character vectors of repeating values, this *really* pays off.

## tweets.R
library(ggplot2)
library(lubridate)
library(dplyr)
library(reshape2)
myname <- "@vsbuffalo" # for removing later

d <- read.csv("tweets.csv", header=TRUE, stringsAsFactors=FALSE)

extractMentions <- function(x) {
    gsub("[^@]*(@[a-zA-Z0-9_]+).*", "\\1", x, perl=TRUE)

## bds-toc.md

      
              1 file
            
          
              0 forks
            
          
              0 comments
            
          
              1 star
            
          
                vsbuffalo
                / bds-toc.md
            
            
              Last active
              August 29, 2015 13:58
            
              
                Bioinformatics Data Skills ToC
              
          
    Bioinformatics Data Skills Table of Contents

This may change due to length considerations. Parts in bold are available for early release from O'Reilly.
Part I. Ideology: Data Skills, Robust and Reproducible Bioinformatics


How to Learn Robust and Reproducible Bioinformatics

Part II. Prerequisites: Setting up a Project, Working with Unix, Version Control, and Data


## summarizeByTile.R
library(GenomicRanges)

summarizeByTile <-
# given a GRanges (or some sort of ranged data) object `x`, and a
# *corresponding* vector of values `y` to summarize (these *must*
# correspond), calculate the summary per tile with the function `fun`.
# Note: this is still beta; wider tests coming, use with caution.
function(x, y, tiles, fun, mcol_name="y") {
    stopifnot(length(x) == length(y))

## entropy_class.py
from __future__ import division
from collections import Counter
from math import log

def entropy(seq, unit="bit"):
    """
    Returns entropy of DNA sequence.

    The entropy formula is:
    entropy = -sum_i (log(p_i) * p_i)

## entropy_vince.py
"""
entropy.py

Calculate entropy of a given list.
"""
from math import log, log10
from collections import Counter
import pdb

def entropy(x, logfun=lambda x: log(x, 2)):

## naive_nshared.py
import sys
from readfq import readfq
from itertools import combinations
from datetime import datetime

def num_shared(seq_a, seq_b, consensus_seq):
    """
    Given two alignment sequences in multiple alignment FASTA format,
    calculate the number of shared SNPs (for minor alleles only, not
    in consensus).

## repeat_mut_sims.py
import numpy as np
from itertools import combinations
from collections import Counter
import datetime as dt

np.random.seed(0)

def repeat_mutation_sim(G, N, L, mu=3e-8):
    """
    Generate N repeats of length L mutating at rate

## .tmux
# use GNU screen's C-a binding, since it's programmed in my brain
set-option -g prefix C-a
unbind C-b

# use GNU screen's C-a C-a for last window
bind-key C-a last-window

# use 1-based window indexing, since the 1 key is close at hand
set -g base-index 1

## trim.sh
#!/bin/bash
# trim.sh - generic, slightly insane paired end quality trimming script
# Vince Buffalo <vsbuffaloAAAAAA@gmail.com> (sans poly-A)
set -e
set -u

## pre-config
ADAPTERS=illumina_adapters.fa
SAMPLE_NAME=some_sample_name
IN1=in1.fastq
	# Factors are more memory efficient than character vectors holding many
	# repeated labels: each distinct label is stored once (in the `levels`
	# attribute) and every element is a 4-byte integer code that maps to it.
	# A character vector instead keeps one pointer per element (8 bytes on
	# 64-bit R), so the factor below needs roughly half the memory. E.g.:

	a <- sample(paste0("chrom", c(1:22, "X", "Y")), 1e8, replace = TRUE)
	object.size(a)
	# 800001192 bytes
	object.size(factor(a))
	# 400001744 bytes

	# For long character vectors of repeating values, this really pays off.
	library(ggplot2)
	library(lubridate)
	library(dplyr)
	library(reshape2)
	myname <- "@vsbuffalo" # for removing later

	d <- read.csv("tweets.csv", header=TRUE, stringsAsFactors=FALSE)

	extractMentions <- function(x) {
	gsub("[^@]*(@[a-zA-Z0-9_]+).*", "\\1", x, perl=TRUE)
	library(GenomicRanges)

	summarizeByTile <-
	# given a GRanges (or some sort of ranged data) object `x`, and a
	# corresponding vector of values `y` to summarize (these must
	# correspond), calculate the summary per tile with the function `fun`.
	# Note: this is still beta; wider tests coming, use with caution.
	function(x, y, tiles, fun, mcol_name="y") {
	stopifnot(length(x) == length(y))
	from __future__ import division
	from collections import Counter
	from math import log

	def entropy(seq, unit="bit"):
	"""
	Returns entropy of DNA sequence.

	The entropy formula is:
	entropy = -sum_i (log(p_i) * p_i)
	"""
	entropy.py

	Calculate entropy of a given list.
	"""
	from math import log, log10
	from collections import Counter
	import pdb

	def entropy(x, logfun=lambda x: log(x, 2)):
	import sys
	from readfq import readfq
	from itertools import combinations
	from datetime import datetime

	def num_shared(seq_a, seq_b, consensus_seq):
	"""
	Given two alignment sequences in multiple alignment FASTA format,
	calculate the number of shared SNPs (for minor alleles only, not
	in consensus).
	import numpy as np
	from itertools import combinations
	from collections import Counter
	import datetime as dt

	np.random.seed(0)

	def repeat_mutation_sim(G, N, L, mu=3e-8):
	"""
	Generate N repeats of length L mutating at rate
	# use GNU screen's C-a binding, since it's programmed in my brain
	set-option -g prefix C-a
	unbind C-b

	# use GNU screen's C-a C-a for last window
	bind-key C-a last-window

	# use 1-based indexing, since 1 is close
	set -g base-index 1
	#!/bin/bash
	# trim.sh - generic, slightly insane paired end quality trimming script
	# Vince Buffalo <vsbuffaloAAAAAA@gmail.com> (sans poly-A)
	set -e
	set -u

	## pre-config
	ADAPTERS=illumina_adapters.fa
	SAMPLE_NAME=some_sample_name
	IN1=in1.fastq