Jan Oppelt opplatek

## deeptools-heatmap-fast.sh
#!/bin/bash
#
# Speed up deepTools computeMatrix by splitting the references into smaller chunks and then merging the matrices together
#

# Tunables for the chunked computeMatrix run.
# NOTE(review): the code that consumes these values is not visible here; the
# meanings below are inferred from the names and the header comment - confirm.
# presumably the number of reference positions per chunk
positions=5000
# presumably the number of threads / parallel jobs
threads=12

# $RANDOM expands to a pseudo-random integer in bash; it is captured once so the
# same value can be reused later (presumably as a unique suffix - TODO confirm).
rnd=$RANDOM

## automate_revigo.py
#!/usr/bin/python3
#
# Python script for programmatic access to Revigo. Run it with (last output file name is optional):
# python3 revigo.py example.csv 9606
# http://revigo.irb.hr/FAQ.aspx -> How do I integrate Revigo with my service or a programming language? -> The advanced job submitting method
#
# Make revigo python virtual env.
#python3 -m venv revigo
#source revigo/bin/activate
#pip install --upgrade pip

## split_vector.R
#
# Splitting vector into smaller chunks by a separator and extracting chunk(s)
#

#' Split each string by a separator and extract the selected chunk(s)
#'
#' @param x Character vector of strings to split.
#' @param sep Separator, matched literally (not as a regular expression).
#' @param column Index (or indices) of the chunk(s) to keep from each string.
#' @return Character vector of the extracted chunks, flattened across `x`;
#'   chunks that do not exist come back as `NA`. Empty `x` gives
#'   `character(0)`.
split_vector <- function(x, sep, column) {
  # unlist() of an empty list is NULL; return a typed empty vector instead.
  if (length(x) == 0L) {
    return(character(0))
  }
  # Spell out TRUE: `T` is an ordinary variable and can be reassigned, which
  # would silently switch strsplit() to regular-expression matching.
  unlist(lapply(x, function(s) strsplit(s, sep, fixed = TRUE)[[1]][column]))
}

# Test

## fastq_names_seqs.sh
# Print every sequence line of a gzipped FASTQ (line 2 of each 4-line record)
zcat in.fastq.gz | awk 'NR % 4 == 2'

# Print every read-name line of a gzipped FASTQ (line 1 of each 4-line record)
zcat in.fastq.gz | awk 'NR % 4 == 1'

## get_names_sam.sh
#!/bin/bash
#
# Quickly extract unique read names from SAM/BAM file
# Source: https://www.biostars.org/p/371705/#371748
#
# This will extract unique read names for all unmapped reads (-f 4) from BAM file
# awk is much faster than `sort --parallel=$threads | uniq` because it doesn't have to do the sorting
# On BAM file with 149,909,118 input reads:
# sort | uniq takes (with 12 threads):
#real    1m8.240s

## stdin_stdout.py
#!/usr/bin/python3

import sys

# Read from the intab file when one was given; otherwise fall back to stdin
f = open(intab, "r") if intab else sys.stdin

## sed_replace_multiple.sh
#!/bin/bash
#
# Replace each run of one or more spaces (or any other character) with sed
# The extended-regex quantifier + means "one or more"; the basic-regex
# equivalent is 's/  */X/g' (note the TWO spaces before the * character)
# Note: keep "$text" quoted, otherwise word splitting squeezes the run of spaces into one
#

text="hello     world"
printf '%s\n' "$text" | sed -E 's/ +/X/g'
#helloXworld

## reformat_rna_ref.sh
#!/bin/bash
#
# Get miRBase hairpin FASTA (multiline, with U instead of T) and reformat it to "normal" reference FASTA
#

# Download the gzipped hairpin FASTA from the miRBase FTP URL below
wget ftp://mirbase.org/pub/mirbase/21/hairpin.fa.gz # Get the sequence

# Pipeline stages:
#   perl : on header lines (starting with ">") print a newline before the
#          header; on every other line strip the line ending, so the wrapped
#          sequence lines of one record are joined into a single line
#   tail : drop the first output line, i.e. the empty line perl emits before
#          the very first header
#   sed  : on non-empty lines that do not start with ">" transliterate
#          u->t and U->T (header lines are left untouched)
# NOTE(review): perl chomps the final sequence line as well, so the output
# appears to end without a trailing newline - confirm whether that matters.
zcat hairpin.fa.gz | perl -pe '/^>/ ? print "\n" : chomp' \
    | tail -n +2 | sed '/^[^>]/ y/uU/tT/' \
    > hairpin.oneline.fasta # Convert to one-line fasta and replace all Us for Ts

## replace_first_line.sh
#!/bin/bash
#
# Replace first line in a file
#

# Prepend a header line in front of the current first line
printf 'a\nb\n' | sed '1i transcript_id\tlength'

# Swap the first line for a new one: drop it with tail, then insert the replacement
printf 'a\nb\n' | tail -n +2 | sed '1i transcript_id\tlength'

## remap_bam.sh
#!/bin/bash
#
# Quickly remap reads from BAM file to new reference
#
# Source: https://twitter.com/lh3lh3/status/1132756684789768202
# More details: https://janbio.home.blog/2019/05/30/useful-one-liners-collection/
#

samtools collate -uOn128 old-pos-srt.bam tmpxyz \
  | samtools fastq - \
	#!/bin/bash
	#
	# Speed up deepTools computeMatrix by splitting the references into smaller chunks and then merging the matrices together
	#

	# Tunables for the chunked computeMatrix run.
	# NOTE(review): the code that consumes these values is not visible here; the
	# meanings below are inferred from the names and the header comment - confirm.
	# presumably the number of reference positions per chunk
	positions=5000
	# presumably the number of threads / parallel jobs
	threads=12

	# $RANDOM expands to a pseudo-random integer in bash; it is captured once so the
	# same value can be reused later (presumably as a unique suffix - TODO confirm).
	rnd=$RANDOM
	#!/usr/bin/python3
	#
	# Python script for programmatic access to Revigo. Run it with (last output file name is optional):
	# python3 revigo.py example.csv 9606
	# http://revigo.irb.hr/FAQ.aspx -> How do I integrate Revigo with my service or a programming language? -> The advanced job submitting method
	#
	# Make revigo python virtual env.
	#python3 -m venv revigo
	#source revigo/bin/activate
	#pip install --upgrade pip
	#
	# Splitting vector into smaller chunks by a separator and extracting chunk(s)
	#

	#' Split each string by a separator and extract the selected chunk(s)
	#'
	#' @param x Character vector of strings to split.
	#' @param sep Separator, matched literally (not as a regular expression).
	#' @param column Index (or indices) of the chunk(s) to keep from each string.
	#' @return Character vector of the extracted chunks, flattened across `x`;
	#'   chunks that do not exist come back as `NA`. Empty `x` gives
	#'   `character(0)`.
	split_vector <- function(x, sep, column) {
	  # unlist() of an empty list is NULL; return a typed empty vector instead.
	  if (length(x) == 0L) {
	    return(character(0))
	  }
	  # Spell out TRUE: `T` is an ordinary variable and can be reassigned, which
	  # would silently switch strsplit() to regular-expression matching.
	  unlist(lapply(x, function(s) strsplit(s, sep, fixed = TRUE)[[1]][column]))
	}

	# Test
	# Print every sequence line of a gzipped FASTQ (line 2 of each 4-line record).
	# The pipe must be unescaped: a backslash-escaped pipe is handed to zcat as a
	# literal argument instead of connecting zcat to awk.
	zcat in.fastq.gz | awk 'NR % 4 == 2'

	# Print every read-name line of a gzipped FASTQ (line 1 of each 4-line record)
	zcat in.fastq.gz | awk 'NR % 4 == 1'
	#!/bin/bash
	#
	# Quickly extract unique read names from SAM/BAM file
	# Source: https://www.biostars.org/p/371705/#371748
	#
	# This will extract unique read names for all unmapped reads (-f 4) from BAM file
	# awk is much faster than `sort --parallel=$threads | uniq` because it doesn't have to do the sorting
	# On BAM file with 149,909,118 input reads:
	# sort | uniq takes (with 12 threads):
	#real 1m8.240s
	#!/usr/bin/python3

	import sys

	# If intab (input) is defined, read it; if not assume stdin.
	# The branch bodies must be indented under if/else, otherwise Python raises
	# an IndentationError.
	if intab:
	    f = open(intab, "r")
	else:
	    f = sys.stdin
	#!/bin/bash
	#
	# Replace one or more spaces (characters) with sed
	# The pattern has to match ONE OR MORE spaces: ' +' with -E, or two spaces
	# before the * in basic regex. With a single space before the *, the pattern
	# also matches the empty string and X is inserted between every character.
	# Note: If you don't quote $text, echo will strip multiple spaces to just one
	#

	text="hello     world"
	echo "$text" | sed -E 's/ +/X/g'
	#helloXworld
	#!/bin/bash
	#
	# Get miRBase hairpin FASTA (multiline, with U instead of T) and reformat it to "normal" reference FASTA
	#

	# Download the gzipped hairpin FASTA from the miRBase FTP URL below
	wget ftp://mirbase.org/pub/mirbase/21/hairpin.fa.gz # Get the sequence

	# Pipeline stages (the pipes must be unescaped to actually connect the commands):
	#   perl : print a newline before each ">" header and strip the line ending of
	#          every other line, joining wrapped sequence lines into one line
	#   tail : drop the empty first line perl emits before the very first header
	#   sed  : on non-empty lines not starting with ">" transliterate u->t and U->T
	zcat hairpin.fa.gz | perl -pe '/^>/ ? print "\n" : chomp' \
	    | tail -n +2 | sed '/^[^>]/ y/uU/tT/' \
	    > hairpin.oneline.fasta # Convert to one-line fasta and replace all Us for Ts
	#!/bin/bash
	#
	# Replace first line in a file
	#

	# Insert a new first line in front of the existing one
	# (the pipe must be unescaped, otherwise echo just prints the rest as text)
	echo -e "a\nb" | sed '1i transcript_id\tlength'

	# Replace the first line: drop it with tail, then insert the replacement
	echo -e "a\nb" | tail -n +2 | sed '1i transcript_id\tlength'
	#!/bin/bash
	#
	# Quickly remap reads from BAM file to new reference
	#
	# Source: https://twitter.com/lh3lh3/status/1132756684789768202
	# More details: https://janbio.home.blog/2019/05/30/useful-one-liners-collection/
	#

	samtools collate -uOn128 old-pos-srt.bam tmpxyz \
	\| samtools fastq - \