Fiorella Schischlik sp00nman

## zip_files.sh
# One-liner courtesy of Michael Schuster.
# Compress every FASTQ file found below the current directory:
#   find . -name '*.fastq'   select all paths ending in .fastq, starting at "."
#   -print                   echo each matching path as it is processed
#   -exec gzip --best {}     run gzip on each match ({} is the path); gzip keeps
#                            the timestamp and removes the original file
#   ';'                      ends the -exec command; quoted so that the shell
#                            passes it through to find unchanged
find . -name '*.fastq' -print -exec gzip --best {} ';'

## jupyter_notebooks.md

      
              1 file
            
          
              1 fork
            
          
              0 comments
            
          
              0 stars
            
          
                sp00nman
                / jupyter_notebooks.md
            
            
              Last active
              March 16, 2019 13:55
                — forked from afrendeiro/jupyter_notebooks.md
            
              
                A list of jupyter notebooks in bioinformatics
              
          
Introduction to the Linux command line, IPython, Git, and R for bioinformatics:


https://github.com/biom262/biom262-2016
https://github.com/ricket-sjtu/bi028
https://github.com/YeoLab/single-cell-bioinformatics/
https://github.com/YeoLab/single-cell-bioinformatics/tree/master/notebooks
https://github.com/ltminty/Bioinformatics-and-R/blob/master/Rbioinformatics.ipynb

RNA-seq:


https://github.com/Caroldm/RNA-Seq-Tutorial---Jupyter-notebook/blob/master/RNAseq%20tutorial.ipynb


## id_conversion.R
#! /usr/bin/env Rscript

# Attach rtracklayer (provides import()) without its start-up chatter.
suppressPackageStartupMessages(library("rtracklayer"))

# Read the GTF annotation.
reference_granges <- import("file1.gtf")

# Pair every gene ID with its transcript ID, keeping the IDs as plain strings.
reference_frame <- data.frame(
  ensembl_gene_id = reference_granges$gene_id,
  ensembl_transcript_id = reference_granges$transcript_id,
  stringsAsFactors = FALSE
)

# Collapse repeated gene/transcript pairs into a single row each.
reference_frame <- unique(reference_frame)

# The parsed annotation is no longer needed once the ID table exists.
rm(reference_granges)

## sort_bed.sh
#!/bin/bash

# Sort a BED file by column 1 (chromosome name, lexicographic) and then by
# column 2 (start coordinate, numeric).
#
# Argument: $1  path of the BED file to sort
# Output:   the sorted records are written to "<input path>_sorted.bed"
#
# ${1:?...} aborts with a usage message when no input file is given, instead of
# letting sort read from stdin and write to a file called "_sorted.bed".
BED_FILE="${1:?usage: $0 BED_FILE}"

# The path is quoted so that names containing spaces or glob characters stay a
# single argument; "--" stops a name starting with "-" from being read as an
# option.
sort -k1,1 -k2,2n -- "$BED_FILE" >"${BED_FILE}_sorted.bed"

## format
#chr1  12
#chr2  56
#chr10 13

## rsync_repository.sh
# Gist kindly provided by M.Schuster (https://github.com/mkschuster)

# Synchronise a directory tree with rsync. Options used below:
#   --verbose     report what is being transferred
#   --recursive   descend into subdirectories
#   --delete      delete destination files that do not exist in the source
#   --times       preserve modification times
#   --rsh 'ssh'   use ssh as the remote shell
#   --exclude     skip paths matching '*~' and '.DS_Store'
# NOTE(review): the command ends on a line continuation and no source or
# destination operands are visible here; they must follow the last option
# before this can be run.
rsync \
    --verbose \
    --recursive \
    --delete \
    --times \
    --rsh 'ssh' \
    --exclude '*~' \
    --exclude '.DS_Store' \

## reformat_bed_chr
#Example_input.bed
#ENST00000456328 1 + 11868 14409 11868 11868 3 11868,12612,13220, 12227,12721,14409, DDX11L1

# Prefix the chromosome name in column 2 with "chr" (e.g. 1 -> chr1).
# The field is assigned directly rather than through gsub(chrom, ...): gsub
# treats its first argument as a regular expression, so chromosome names that
# contain regex metacharacters (for example "*" or "+") were not matched and
# stayed unprefixed.
# Assigning to $2 makes awk rebuild the record, so the output fields are
# separated by single spaces.
awk '{ $2 = "chr" $2; print }' Example_input.bed >Example_output.bed

#Example_output.bed
#ENST00000456328 chr1 + 11868 14409 11868 11868 3 11868,12612,13220, 12227,12721,14409, DDX11L1

## merge_mulitplexed_samples
#!/bin/bash

# UPD: file with the unique patient IDs (UPDs), newline separated,
# e.g. H_0047C, H_0060A, H_0062D, ...
UPD="${1}"

# SPATH: search path
SPATH="${2}"

# FEXTENSION...file extension to search for [eg. .fastq.gz]

## calculate_intron_length_gtf
#!/bin/awk -f

# Use a tab as the output field separator.
BEGIN{OFS="\t"}
# For every input record, store selected fields in arrays indexed by the
# record number (NR).
# NOTE(review): nothing that reads these arrays (e.g. an END block) is visible
# here; the script appears to be truncated.
# NOTE(review): no field separator is set in the visible code, so fields are
# split on whitespace unless -F is given on the command line. The array names
# suggest $1=chromosome, $4=start, $5=end, $7=strand, $9=Ensembl ID,
# $10=gene symbol, $11=feature ID -- confirm against the expected input layout.
{ start[NR]=$4;
  end[NR]=$5;
  strand[NR]=$7;
  ID[NR]=$11;
  chr[NR]=$1;
  ens[NR]=$9;
  symb[NR]=$10}

## merge_files_column
# Join file2 against file1 on a shared key (tab-separated input).
#   Pass 1 (file1): remember each complete line under the value of its column 2.
#   Pass 2 (file2): for every line whose column 4 is a remembered key, print the
#                   file2 line, a tab, and the matching file1 line.
awk -F"\t" '
    FNR == NR {
        line_by_key[$2] = $0
        next
    }
    ($4 in line_by_key) {
        print $0 "\t" line_by_key[$4]
    }
' file1 file2

## gtf_transcript_length
# resource: http://seqanswers.com/forums/showthread.php?t=4914
# Calculate length for each transcript for a GTF file
# Accumulate exon lengths per transcript ID into L[ID].
# The pattern and its action must start on the same line: with the pattern
# alone on a line, awk prints every exon record (default action) and runs the
# following block for EVERY record, so non-exon features were summed as well.
# NOTE(review): the ID is taken as the 15 characters that precede the last two
# characters of column 9; this presumably expects the attribute column to end
# in a 15-character transcript ID followed by a closing quote and semicolon --
# confirm for the GTF in use.
awk -F"\t" '
$3 == "exon" {
        ID = substr($9, length($9) - 16, 15)
        L[ID] += $5 - $4 + 1
    }
END{
    for(i in L)
	# One-liner courtesy of Michael Schuster.
	# Compress every FASTQ file found below the current directory:
	#   find . -name '*.fastq'   select all paths ending in .fastq, starting at "."
	#   -print                   echo each matching path as it is processed
	#   -exec gzip --best {}     run gzip on each match ({} is the path); gzip keeps
	#                            the timestamp and removes the original file
	#   ';'                      ends the -exec command; quoted so that the shell
	#                            passes it through to find unchanged
	find . -name '*.fastq' -print -exec gzip --best {} ';'
	#! /usr/bin/env Rscript

	# Attach rtracklayer (provides import()) without its start-up chatter.
	suppressPackageStartupMessages(library("rtracklayer"))

	# Read the GTF annotation.
	reference_granges <- import("file1.gtf")

	# Pair every gene ID with its transcript ID, keeping the IDs as plain strings.
	reference_frame <- data.frame(
	  ensembl_gene_id = reference_granges$gene_id,
	  ensembl_transcript_id = reference_granges$transcript_id,
	  stringsAsFactors = FALSE
	)

	# Collapse repeated gene/transcript pairs into a single row each.
	reference_frame <- unique(reference_frame)

	# The parsed annotation is no longer needed once the ID table exists.
	rm(reference_granges)
	#!/bin/bash

	# Sort a BED file by column 1 (chromosome name, lexicographic) and then by
	# column 2 (start coordinate, numeric).
	#
	# Argument: $1  path of the BED file to sort
	# Output:   the sorted records are written to "<input path>_sorted.bed"
	#
	# ${1:?...} aborts with a usage message when no input file is given, instead of
	# letting sort read from stdin and write to a file called "_sorted.bed".
	BED_FILE="${1:?usage: $0 BED_FILE}"

	# The path is quoted so that names containing spaces or glob characters stay a
	# single argument; "--" stops a name starting with "-" from being read as an
	# option.
	sort -k1,1 -k2,2n -- "$BED_FILE" >"${BED_FILE}_sorted.bed"

	## format
	#chr1 12
	#chr2 56
	#chr10 13
	# Gist kindly provided by M.Schuster (https://github.com/mkschuster)

	# Synchronise a directory tree with rsync. Options used below:
	#   --verbose     report what is being transferred
	#   --recursive   descend into subdirectories
	#   --delete      delete destination files that do not exist in the source
	#   --times       preserve modification times
	#   --rsh 'ssh'   use ssh as the remote shell
	#   --exclude     skip paths matching '*~' and '.DS_Store'
	# NOTE(review): the command ends on a line continuation and no source or
	# destination operands are visible here; they must follow the last option
	# before this can be run.
	rsync \
	--verbose \
	--recursive \
	--delete \
	--times \
	--rsh 'ssh' \
	--exclude '*~' \
	--exclude '.DS_Store' \
	#Example_input.bed
	#ENST00000456328 1 + 11868 14409 11868 11868 3 11868,12612,13220, 12227,12721,14409, DDX11L1

	# Prefix the chromosome name in column 2 with "chr" (e.g. 1 -> chr1).
	# The field is assigned directly rather than through gsub(chrom, ...): gsub
	# treats its first argument as a regular expression, so chromosome names that
	# contain regex metacharacters (for example "*" or "+") were not matched and
	# stayed unprefixed.
	# Assigning to $2 makes awk rebuild the record, so the output fields are
	# separated by single spaces.
	awk '{ $2 = "chr" $2; print }' Example_input.bed >Example_output.bed

	#Example_output.bed
	#ENST00000456328 chr1 + 11868 14409 11868 11868 3 11868,12612,13220, 12227,12721,14409, DDX11L1
	#!/bin/bash

	# UPD: file with the unique patient IDs (UPDs), newline separated,
	# e.g. H_0047C, H_0060A, H_0062D, ...
	UPD="${1}"

	# SPATH: search path
	SPATH="${2}"

	# FEXTENSION...file extension to search for [eg. .fastq.gz]
	#!/bin/awk -f

	# Use a tab as the output field separator.
	BEGIN{OFS="\t"}
	# For every input record, store selected fields in arrays indexed by the
	# record number (NR).
	# NOTE(review): nothing that reads these arrays (e.g. an END block) is visible
	# here; the script appears to be truncated.
	# NOTE(review): no field separator is set in the visible code, so fields are
	# split on whitespace unless -F is given on the command line. The array names
	# suggest $1=chromosome, $4=start, $5=end, $7=strand, $9=Ensembl ID,
	# $10=gene symbol, $11=feature ID -- confirm against the expected input layout.
	{ start[NR]=$4;
	end[NR]=$5;
	strand[NR]=$7;
	ID[NR]=$11;
	chr[NR]=$1;
	ens[NR]=$9;
	symb[NR]=$10}
	# resource: http://seqanswers.com/forums/showthread.php?t=4914
	# Calculate length for each transcript for a GTF file
	awk -F"\t" '
	$3=="exon"
	{
	ID=substr($9, length($9)-16, 15);
	L[ID]+=$5-$4+1
	}
	END{
	for(i in L)