Jan Oppelt opplatek

## subset_bam.sh
#!/bin/bash
#
# Subsample BAM to $SUBSAMPLE_MAPPINGS mappings (approximately)
#
# TODO: Make subsampling by number of reads, not mappings (a single read can be mapped multiple times, so the final number
#   of "lines" will be different from $SUBSAMPLE_READS)
# check out https://www.biostars.org/p/9592303/#9593989
#

## bashrc_diff
$ diff /etc/skel/.bashrc ~/.bashrc # to apply the diff run $ patch bashrcToBeChanged bashrc_diff
58a59,64
> ## Add git branch name to prompt https://coderwall.com/p/fasnya/add-git-branch-name-to-bash-prompt
> parse_git_branch() {
>      git branch 2> /dev/null | sed -e '/^[^*]/d' -e 's/* \(.*\)/ (\1)/'
> }
> #export PS1="\u@\h \[\033[32m\]\w\[\033[33m\]\$(parse_git_branch)\[\033[00m\] $ "
>
60c66,67
<     PS1='${debian_chroot:+($debian_chroot)}\[\033[01;32m\]\u@\h\[\033[00m\]:\[\033[01;34m\]\w\[\033[00m\]\$ '

## bash_aliases
# Short names (and one typo guard) for frequently used commands.
# Alias names need no quoting; only values containing spaces or ';' are quoted.
alias h=head
alias haed=head # My most common typo
alias gitl="git log -p"
alias p=pwd
alias dockeri="docker images"
# Turns all swap off and back on again
alias cleanswap="sudo swapoff -a; sudo swapon -a"
alias python3.8unit="python3.8 -m unittest"

function gitreset() {
  local git_branch_name=$(git rev-parse --abbrev-ref HEAD)

## fill_number_digits.sh
#!/bin/bash

DIGIT=52          # number to pad
TARGET_DIGITS=4   # total width of the zero-padded result

# Easier
# printf pads DIGIT with leading zeros up to TARGET_DIGITS characters. The width
# is passed through '*' so the format string stays constant, and printf writes
# the result itself (no echo around a command substitution needed).
printf '%0*d\n' "${TARGET_DIGITS}" "${DIGIT}"
#0052

# More complicated

## length-of-string.sh
#!/bin/bash

VAR="abcd"

# ${#VAR} expands to the number of characters stored in VAR
printf '%s\n' "${#VAR}"

## parse-gitlab-projects.py
#!/usr/bin/env python3
#
# Read GitLab raw JSON and extract ssh url
#

import sys
import json

def filter_dicts(list_of_dicts, value_to_check):
    filtered_list = [item for item in list_of_dicts if not item.get(value_to_check, True)]

## read_file_list.R
#
# Read a list of files (TSV) and merge them to one data.frame
#

#' Read one tab-separated file into a data.frame
#'
#' @param x Passed to `read.table()` as `file`; the first line is read as
#'   the column names.
#' @return A data.frame; character columns stay character (no factors).
read_tsv <- function(x) {
  # Spell out TRUE/FALSE: `T` and `F` are ordinary variables that can be
  # reassigned, which would silently change how the file is parsed.
  read.table(file = x, header = TRUE, stringsAsFactors = FALSE, sep = "\t")
}

files <- c("a.tsv", "b.tsv", "c.tsv")

## awk_median_and_mean.sh
#!/bin/bash
#
# Calculate median and mean with AWK from a specific column
#

COL_NUMBER=1 # column number with values to calculate median/mean from

# Mean: add up column N over every input line and divide by the line count (NR);
# nothing is printed for empty input.
# printf replaces `echo -e` (whose escape handling differs between shells), and the
# expansions are quoted so an empty or space-containing COL_NUMBER cannot be word-split.
# NOTE(review): awk splits fields on any whitespace while `cut -f` below splits on tabs
#   only - for space-delimited input the two pipelines would read different columns;
#   confirm the real input is tab-separated (or single-column).
MEAN=$(printf '1\n10\n5\n4\n3\n' | awk -v N="$COL_NUMBER" '{ sum += $N } END { if (NR > 0) print sum / NR }')
# Median: sort the column numerically and load it into the 0-based array a (i = count).
# With x = int((i+1)/2): for an odd count (i+1)/2 is whole, so the middle value a[x-1] is
# printed; for an even count x < (i+1)/2, so the two middle values a[x-1] and a[x] are averaged.
# The i == 0 guard mirrors the NR > 0 check above, so empty input prints nothing instead of 0.
MEDIAN=$(printf '1\n10\n5\n4\n3\n' | cut -f"${COL_NUMBER}" | sort -n | awk ' { a[i++]=$1; } END { if (i == 0) exit; x=int((i+1)/2); if (x < (i+1)/2) print (a[x-1]+a[x])/2; else print a[x-1]; }')


## extract_subcolumn_info_awk
# We have a file that has a column with multiple additional "subcolumns" separated by the ; character
# We want to extract a specific subcolumn from the "merged" column that starts with a specific word

# Get 1000 Genomes project VCF
wget http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/phase3/integrated_sv_map/supporting/GRCh38_positions/ALL.wgs.mergedSV.v8.20130502.svs.genotypes.GRCh38.vcf.gz

# This VCF has an INFO column (column no. 8) that stores additional information
zcat ALL.wgs.mergedSV.v8.20130502.svs.genotypes.GRCh38.vcf.gz | grep -v "^#"| cut -f8 | head
#AC=35;AF=0.00698882;AFR_AF=0;AMR_AF=0.0072;AN=5008;CS=ALU_umary;EAS_AF=0.0069;EUR_AF=0.0189;MEINFO=AluYa4_5,1,223,-;NS=2504;SAS_AF=0.0041;SITEPOST=0.9998;SVLEN=222;SVTYPE=ALU;TSD=null
#AC=64;AF=0.0127795;AFR_AF=0.0015;AMR_AF=0;AN=5008;CIEND=-150,150;CIPOS=-150,150;CS=DUP_delly;EAS_AF=0.0595;END=914824;EUR_AF=0.001;IMPRECISE;NS=2504;SAS_AF=0.001;SITEPOST=1;SVTYPE=DUP

## apply-to-df.R
#
# Convert apply() result to data.frame
#

# Sample data frame
df <- data.frame(A = c(1, 2, 3),
                 B = c(4, 5, 6))

# Sample function
calculate_sum <- function(row) {
	#!/bin/bash
	#
	# Subsample BAM to $SUBSAMPLE_MAPPINGS mappings (approximately)
	#
	# TODO: Make subsampling by number of reads, not mappings (a single read can be mapped multiple times so the final number
	# or "lines" will be different than the $SUBSAMPLE_READS)
	# check out https://www.biostars.org/p/9592303/#9593989
	#
	$ diff /etc/skel/.bashrc ~/.bashrc # to apply the diff run $ patch bashrcToBeChanged bashrc_diff
	58a59,64
	> ## Add git branch name to prompt https://coderwall.com/p/fasnya/add-git-branch-name-to-bash-prompt
	> parse_git_branch() {
	> git branch 2> /dev/null \| sed -e '/^[^]/d' -e 's/ \(.*\)/ (\1)/'
	> }
	> #export PS1="\u@\h \[\033[32m\]\w\[\033[33m\]\$(parse_git_branch)\[\033[00m\] $ "
	>
	60c66,67
	< PS1='${debian_chroot:+($debian_chroot)}\[\033[01;32m\]\u@\h\[\033[00m\]:\[\033[01;34m\]\w\[\033[00m\]\$ '
	alias 'h'='head'
	alias haed="head" # My most common typo
	alias 'gitl'='git log -p'
	alias 'p'='pwd'
	alias 'dockeri'='docker images'
	alias 'cleanswap'='sudo swapoff -a; sudo swapon -a'
	alias 'python3.8unit'='python3.8 -m unittest'

	function gitreset() {
	local git_branch_name=$(git rev-parse --abbrev-ref HEAD)
	#!/bin/bash

	DIGIT=52
	TARGET_DIGITS=4

	# Easier
	echo $(printf "%0${TARGET_DIGITS}d" ${DIGIT})
	#0052

	# More complicated
	#!/usr/bin/env python3
	#
	# Read GitLab raw JSON and extract ssh url
	#

	import sys
	import json

	def filter_dicts(list_of_dicts, value_to_check):
	filtered_list = [item for item in list_of_dicts if not item.get(value_to_check, True)]
	#
	# Read a list of files (TSV) and merge them to one data.frame
	#

	read_tsv <- function(x){
	df.file <- read.table(file = x, header = T, stringsAsFactors = F, sep = "\t")
	return(df.file)
	}

	files <- c("a.tsv", "b.tsv", "c.tsv")
	#!/bin/bash
	#
	# Calculate median and mean with AWK from a specific column
	#

	COL_NUMBER=1 # column number with values to calculate median/mean from

	MEAN=$(echo -e "1\n10\n5\n4\n3" \| awk -v N=$COL_NUMBER '{ sum += $N } END { if (NR > 0) print sum / NR }')
	MEDIAN=$(echo -e "1\n10\n5\n4\n3" \| cut -f${COL_NUMBER} \| sort -n \| awk ' { a[i++]=$1; } END { x=int((i+1)/2); if (x < (i+1)/2) print (a[x-1]+a[x])/2; else print a[x-1]; }')
	# We have a file that has a column with multiple additional "subcolumns" separated by the ; character
	# We want to extract a specific subcolumn from the "merged" column that starts with a specific word

	# Get 1000 Genomes project VCF
	wget http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/phase3/integrated_sv_map/supporting/GRCh38_positions/ALL.wgs.mergedSV.v8.20130502.svs.genotypes.GRCh38.vcf.gz

	# This VCF has a INFO column (column no. 8) that stores additonal information
	zcat ALL.wgs.mergedSV.v8.20130502.svs.genotypes.GRCh38.vcf.gz \| grep -v "^#"\| cut -f8 \| head
	#AC=35;AF=0.00698882;AFR_AF=0;AMR_AF=0.0072;AN=5008;CS=ALU_umary;EAS_AF=0.0069;EUR_AF=0.0189;MEINFO=AluYa4_5,1,223,-;NS=2504;SAS_AF=0.0041;SITEPOST=0.9998;SVLEN=222;SVTYPE=ALU;TSD=null
	#AC=64;AF=0.0127795;AFR_AF=0.0015;AMR_AF=0;AN=5008;CIEND=-150,150;CIPOS=-150,150;CS=DUP_delly;EAS_AF=0.0595;END=914824;EUR_AF=0.001;IMPRECISE;NS=2504;SAS_AF=0.001;SITEPOST=1;SVTYPE=DUP
	#
	# Convert apply() result to data.frame
	#

	# Sample data frame
	df <- data.frame(A = c(1, 2, 3),
	B = c(4, 5, 6))

	# Sample function
	calculate_sum <- function(row) {