Dina dinovski

## gtex_tpm.py
#!/usr/bin/env python

import pandas as pd
import numpy as np
import csv
import statistics

# mapping of sample ID to tissue:
#curl https://storage.googleapis.com/gtex_analysis_v8/annotations/GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt -o GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt
# TPM per sample per transcript:

## geneBed.sh
#!/bin/bash

BEDTOOLS=/local/build/BEDTools_2.21.0/bin
FEATURECOUNTS=/local/build/subread/subread-1.5.1-Linux-x86_64/bin/featureCounts
R=/local/build/R/R-3.4.0/bin/R

IDIR=/in/path
DIR=/out/path

GTF=gencode.vM13.annotation.gtf

## dispersion.R
#!/usr/bin/Rscript

library(edgeR)
library(limma)

## The BCV is the relative variability of expression between biological replicates
## The square root of the negative binomial dispersion for a gene
## is the biological coefficient of variation (BCV) across replicates (stdev/mean)

## input: ge (raw count table filtered for lowly expressed genes, eg CPM > 1)

## MSIanalysis.R
#!/usr/bin/Rscript
#source("http://bioconductor.org/biocLite.R")
#biocLite("curatedCRCData")
#biocLite("inSilicoMerging")
library(Biobase)
library(GEOquery)
library(rgl)
library(inSilicoMerging)
library(gdata)
library(scatterplot3d)

## inferAncestry.sh
#!/bin/bash

## 1. Imputation of WGS samples with Impute2 (http://mathgen.stats.ox.ac.uk/impute/impute_v2.html)
## and phasing with Shapeit (http://www.shapeit.fr/)
## 2. File wrangling for local ancestry inference with RFMix (https://github.com/slowkoni/rfmix)
##
## NOTE: 'PopPhased' directory included with RFMix MUST be local to working directory ie. ${ODIR}
## reference data was obtained from http://genetics.med.harvard.edu/reichlab/Reich_Lab/Datasets.html

## Bin paths

## VCFcompare.py
#!/usr/bin/python

import vcf
import sys

Usage="""
python VCFcompare.py ${1KG_VCF} ${CONTROL_VCF} ${CHR} ${OUTFILE}
"""

if len(sys.argv) < 5:

## autoFACS.R
#!/usr/bin/Rscript

## Perform automated gating of FACS data using unsupervised clustering and regression analysis.
## Data is first filtered to exclude debris (low FSC and high SSC) and doublets (FSC-A v FSC-H)
## and the CSV is exported using FlowJo (flowjo.com).

## This analysis was designed to assess the effects of various microsatellite lengths on gene expression
## using a reporter assay. Briefly, varying 'eSTRs' (gBlock gene fragments) were cloned upstream of a CMV
## promoter driving GFP reporter gene expression and co-transfected with an RFP expression plasmid at a
## ratio of 2:1. Reference samples of GFP:RFP were co-transfected at various ratios (1:1, 2:1, 3:1, 4:1) and

## uniqueGenes.R
inDir="/bioinfo/users/dzielins/dbase/mouse/mm10/"
genome="mm10"
geneBed="gencode.vM13.annotation_gene.bed"

inBed <- read.table(paste0(inDir, geneBed), header = FALSE, stringsAsFactors = FALSE, sep = "\t")
colnames(inBed) <- c("chr", "start", "end", "gene", "score", "strand")

## Keep min start and max end for each gene
start <- aggregate(start ~ chr + gene + strand, inBed, min)
end <- aggregate(end ~ chr + gene + strand, inBed, max)

## fasta2bed.py
#!/usr/bin/env python

from Bio import SeqIO
import re
import os
import sys
import csv

usage="""
Perform case insensitive motif search of a FASTA file and output 3 BED files: motif_sites.bed (motif coordinates), motif_fragments.bed (coordinates between each motif), and motif_fragments_full.bed (coordinates between and including each motif)

## qualDist.py
#!/usr/bin/env python

usage="""
## quality score distribution from fastq
gunzip -c fastq.gz | python qualDist.py
"""

import sys

num = 0
	#!/usr/bin/env python

	import pandas as pd
	import numpy as np
	import csv
	import statistics

	# mapping of sample ID to tissue:
	#curl https://storage.googleapis.com/gtex_analysis_v8/annotations/GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt -o GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt
	# TPM per sample per transcript:
	#!/bin/bash

	BEDTOOLS=/local/build/BEDTools_2.21.0/bin
	FEATURECOUNTS=/local/build/subread/subread-1.5.1-Linux-x86_64/bin/featureCounts
	R=/local/build/R/R-3.4.0/bin/R

	IDIR=/in/path
	DIR=/out/path

	GTF=gencode.vM13.annotation.gtf
	#!/usr/bin/Rscript

	library(edgeR)
	library(limma)

	## The BCV is the relative variability of expression between biological replicates
	## The square root of the negative binomial dispersion for a gene
	## is the biological coefficient of variation (BCV) across replicates (stdev/mean)

	## input: ge (raw count table filtered for lowly expressed genes, eg CPM > 1)
	#!/usr/bin/Rscript
	#source("http://bioconductor.org/biocLite.R")
	#biocLite("curatedCRCData")
	#biocLite("inSilicoMerging")
	library(Biobase)
	library(GEOquery)
	library(rgl)
	library(inSilicoMerging)
	library(gdata)
	library(scatterplot3d)
	#!/bin/bash

	## 1. Imputation of WGS samples with Impute2 (http://mathgen.stats.ox.ac.uk/impute/impute_v2.html)
	## and phasing with Shapeit (http://www.shapeit.fr/)
	## 2. File wrangling for local ancestry inference with RFMix (https://github.com/slowkoni/rfmix)
	##
	## NOTE: 'PopPhased' directory included with RFMix MUST be local to working directory ie. ${ODIR}
	## reference data was obtained from http://genetics.med.harvard.edu/reichlab/Reich_Lab/Datasets.html

	## Bin paths
	#!/usr/bin/python

	import vcf
	import sys

	Usage="""
	python VCFcompare.py ${1KG_VCF} ${CONTROL_VCF} ${CHR} ${OUTFILE}
	"""

	if len(sys.argv) < 5:
	#!/usr/bin/Rscript

	## Perform automated gating of FACS data using unsupervised clustering and regression analysis.
	## Data is first filtered to exclude debris (low FSC and high SSC) and doublets (FSC-A v FSC-H)
	## and the CSV is exported using FlowJo (flowjo.com).

	## This analysis was designed to assess the effects of various microsatellite lengths on gene expression
	## using a reporter assay. Briefly, varying 'eSTRs' (gBlock gene fragments) were cloned upstream of a CMV
	## promoter driving GFP reporter gene expression and co-transfected with an RFP expression plasmid at a
	## ratio of 2:1. Reference samples of GFP:RFP were co-transfected at various ratios (1:1, 2:1, 3:1, 4:1) and
	inDir="/bioinfo/users/dzielins/dbase/mouse/mm10/"
	genome="mm10"
	geneBed="gencode.vM13.annotation_gene.bed"

	inBed <- read.table(paste0(inDir, geneBed), header = FALSE, stringsAsFactors = FALSE, sep = "\t")
	colnames(inBed) <- c("chr", "start", "end", "gene", "score", "strand")

	## Keep min start and max end for each gene
	start <- aggregate(start ~ chr + gene + strand, inBed, min)
	end <- aggregate(end ~ chr + gene + strand, inBed, max)
	#!/usr/bin/env python

	from Bio import SeqIO
	import re
	import os
	import sys
	import csv

	usage="""
	Perform case insensitive motif search of a FASTA file and output 3 BED files: motif_sites.bed (motif coordinates), motif_fragments.bed (coordinates between each motif), and motif_fragments_full.bed (coordinates between and including each motif)
	#!/usr/bin/env python

	usage="""
	## quality score distribution from fastq
	gunzip -c fastq.gz | python qualDist.py
	"""

	import sys

	num = 0