Thiago Britto Borges tbrittoborges

## bioinfo_bits.py
#
import operator
import random

# Pick 30 distinct random positions of a sequence and fetch the bases found there.
sequence = "ACGACTGATCGATCGATCGATGCATCGATCGACGAT"
# range() works on both Python 2 and 3, whereas xrange() exists only on Python 2.
random_positions = random.sample(range(len(sequence)), 30)
# itemgetter(*indices) builds a callable returning (seq[i0], seq[i1], ...) as a tuple.
get_positions = operator.itemgetter(*random_positions)
get_positions(sequence)
# Example output (differs between runs because the positions are random):
# ('T', 'C', 'G', 'C', 'A', 'C', 'C', 'T', 'A', 'T', 'G', 'T', 'A', 'T', 'C', 'C', 'T', 'T', 'A', 'G', 'T', 'A', 'A', 'A', 'C', 'G', 'G', 'C', 'G', 'A')

from itertools import groupby

## pd_latex_table.py
def better_table(table, caption, name):
    """Prepare the opening and closing LaTeX of a floating table environment.

    :param table: not used by the code visible in this snippet
    :param caption: text substituted into the caption of the table
    :param name: suffix substituted into the label, giving ``table:<name>``

    NOTE(review): as shown, the function builds ``start`` and ``end`` but
    neither uses ``table`` nor returns anything -- the snippet looks
    truncated; confirm against the full gist.
    """

    # Raw template for str.format: doubled braces are literal LaTeX braces,
    # and the two bare {} receive `caption` and `name`, in that order.
    # It also sets siunitx rounding to two places and centres the content.
    start = r"""
\begin{{table}}[!htb]
\sisetup{{round-mode=places, round-precision=2}}
\caption{{{}}}\label{{table:{}}}
\centering
""".format(caption, name)

    # Closing tag matching the environment opened in `start`.
    end = r"\end{table}"

## unlistfy.py
# Expand a '/'-delimited string column into one row per element.
df['new'] = df['new'].str.split('/') # example how to listify a column of strings

# Keep the original row labels of the non-null entries. Building the frame
# from a bare list would renumber the rows 0..n-1, and the join below would
# then attach values to the wrong rows whenever df has NaNs or a non-default index.
non_null = df['new'].dropna()
temp = pd.DataFrame(non_null.tolist(), index=non_null.index)

# One value per (row label, position); the explicit dropna() removes the
# padding cells of shorter lists regardless of which stack() implementation
# the installed pandas uses.
temp = temp.stack().dropna()
temp.index = temp.index.droplevel(1) # index needs to be coherent with the original dataframe
temp.name = 'new_colum' # name of the new column in the original dataframe

df = df.join(temp)

## example_flowdiagram.tex
\documentclass{article}
\usepackage{tikz}
\usepackage{array}
\usepackage{siunitx}

\usetikzlibrary{shapes.geometric, shapes.misc, arrows, fit, calc}

% \addvmargin{<length>}: place an empty node fitted around the current
% bounding box (uses the `fit` TikZ library loaded above) with vertical
% padding inner ysep=<length> and no horizontal padding (inner xsep=0).
% Presumably meant to be called inside a tikzpicture to add vertical margin
% around everything drawn so far -- confirm against the full example.
\newcommand\addvmargin[1]{
  \node[fit=(current bounding box),inner ysep=#1,inner xsep=0]{};
}

## gist:f3a58425f5f5d5fbab747af5dc364d83
# Run this in your bash command line.
# List all r3 packages installed with conda
# (first column of every `conda list` line that contains "r3"):
conda list | grep r3 | awk '{print $1}'

# Remove all r3 packages, one at a time; -y skips the confirmation prompt.
for i in $(conda list | grep r3 | awk '{print $1}'); do conda remove -y "$i"; done

# Finally, remove R:
conda remove r-essentials

## pandas_reverse_complement.py
# Complement table built once at import time instead of on every call.
# It covers upper- and lowercase bases plus the IUPAC ambiguity codes that
# have a distinct complement (R<->Y, K<->M, B<->V, D<->H); symbols that are
# their own complement (S, W, N, gaps) are deliberately left unmapped.
_COMPLEMENT_TABLE = str.maketrans(
    "ACGTRYKMBDHVacgtrykmbdhv",
    "TGCAYRMKVHDBtgcayrmkvhdb",
)


def reverse_complement(sequence):
    """Return the reverse complement of a DNA sequence.

    :param str sequence: nucleotide sequence; case is preserved, and any
        character without an entry in the complement table is kept as is
    :return: the complemented sequence read in the opposite direction
    :rtype: str
    """
    return sequence.translate(_COMPLEMENT_TABLE)[::-1]

def apply_rc(row):
    """Reverse-complement the ``seq`` field of minus-strand rows.

    :param row: mapping-like record with ``strand`` and ``seq`` entries
    :return: the same ``row`` object; ``seq`` is overwritten in place when
        ``strand`` equals ``'-'`` and left untouched otherwise
    """
    # Rows on any other strand are handed back unchanged.
    if not row['strand'] == '-':
        return row

    row['seq'] = reverse_complement(row['seq'])
    return row


## Junction_type_classification.py
def junction_type2(row):
    """Junction type classification"""
    # if there are no exons supported by reliable junctions
    # return an iterable with empty strings
    if row['exons_w_junct_sup'] is None:
        return ['', '']

    type_ = []
    # each row is a junction
    j_start, j_end, exons, strand = row.loc[[

## dorina_example.py
def analyse(genome, set_a, match_a='any', region_a='any',
            set_b=None, match_b='any', region_b='any',
            combine='or', genes=None, window_a=-1, window_b=-1,
            datadir=None):

# It takes the name of the genome assembly to use, and at least a list of set A regulator names.
# A simple analysis run with a custom regulator would be:

from dorina.run import analyse
results = analyse('hg19', ['/path/to/custom/regulator.bed', 'PARCLIP_PUM2_hg19'])

## mean_sd_read_length.sh

# Print the read count, mean read length and standard deviation of the read
# length for each gzipped FASTQ file raw_reads39.fq.gz .. raw_reads50.fq.gz.
for f in raw_reads{39..50}.fq.gz
do
	echo  "$f   "
	# Line 2 of every 4-line FASTQ record holds the sequence.
	# Variance is E[L^2] - mean^2; take its square root so the value printed
	# as "stddev" really is a standard deviation. Clamp tiny negative values
	# caused by floating-point rounding, and do not divide by zero on an
	# empty file.
	gzip -cd "$f" | awk 'BEGIN { t=0.0;sq=0.0; n=0;} ;NR%4==2 {n++;L=length($0);t+=L;sq+=L*L;}END{if(n==0){print "total 0";exit} m=t/n;v=sq/n-m*m;if(v<0)v=0;printf("total %d avg=%f stddev=%f\n",n,m,sqrt(v));}'
done


## read_fasta_from_str.py
def read_fasta_from_str(fasta):
    """

    :param str fasta: multiple sequences in fasta string
    """
    from itertools import groupby

    def is_header(line):
        return line.startswith(">")
	#
	import operator
	sequence = "ACGACTGATCGATCGATCGATGCATCGATCGACGAT"
	random_positions = random.sample(xrange(len(sequence)), 30)
	get_positions = operator.itemgetter(*random_positions)
	get_positions(sequence)
	('T', 'C', 'G', 'C', 'A', 'C', 'C', 'T', 'A', 'T', 'G', 'T', 'A', 'T', 'C', 'C', 'T', 'T', 'A', 'G', 'T', 'A', 'A', 'A', 'C', 'G', 'G', 'C', 'G', 'A')

	from itertools import groupby
	def better_table(table, caption, name):

	start = r"""
	\begin{{table}}[!htb]
	\sisetup{{round-mode=places, round-precision=2}}
	\caption{{{}}}\label{{table:{}}}
	\centering
	""".format(caption, name)

	end = r"\end{table}"
	df['new'] = df['new'].str.split('/') # example how to listfy a column of strings
	temp = pd.DataFrame(df['new'].dropna().tolist())

	temp = temp.stack()
	temp.index = temp.index.droplevel(1) # index need to be coherent with the original dataframe
	temp.name = 'new_colum' # name of the new column in the original dataframe

	df = df.join(temp)
	\documentclass{article}
	\usepackage{tikz}
	\usepackage{array}
	\usepackage{siunitx}

	\usetikzlibrary{shapes.geometric, shapes.misc, arrows, fit, calc}

	\newcommand\addvmargin[1]{
	\node[fit=(current bounding box),inner ysep=#1,inner xsep=0]{};
	}
	# run this in you bash command line
	# list all r3 packages installed with conda:
	conda list \| grep r3 \| awk '{print $1}')

	# remove all pakages r3
	for i in $(conda list \| grep r3 \| awk '{print $1}'); do conda remove -y $i; done

	# finally, remove R
	conda remove r-essentials
	def reverse_complement(sequence):
	tab = str.maketrans("ACGT", "TGCA")
	return sequence.translate(tab)[::-1]

	def apply_rc(row):
	if row['strand'] == '-':
	row['seq'] = reverse_complement(row['seq'])

	return row
	def junction_type2(row):
	"""Junction type classification"""
	# if there is no exons supported by realible junctions
	# return a interable with empty strings
	if row['exons_w_junct_sup'] is None:
	return ['', '']

	type_ = []
	# each row is a junction
	j_start, j_end, exons, strand = row.loc[[
	def analyse(genome, set_a, match_a='any', region_a='any',
	set_b=None, match_b='any', region_b='any',
	combine='or', genes=None, window_a=-1, window_b=-1,
	datadir=None):

	# It takes the name of the genome assembly to use, and at least a list of set A regulator names.
	# A simple analysis run with a custom regulator would be:

	from dorina.run import analyse
	results = analyse('hg19', ['/path/to/custom/regulator.bed', 'PARCLIP_PUM2_hg19'])

	for f in raw_reads{39..50}.fq.gz
	do
	echo "$f "
	gzip -cd $f \| awk 'BEGIN { t=0.0;sq=0.0; n=0;} ;NR%4==2 {n++;L=length($0);t+=L;sq+=LL;}END{m=t/n;printf("total %d avg=%f stddev=%f\n",n,m,sq/n-mm);}'
	done
	def read_fasta_from_str(fasta):
	"""

	:param str fasta: multiple sequences in fasta string
	"""
	from itertools import groupby

	def is_header(line):
	return line.startswith(">")