slava ilnytskyy slavailn

## remove_carriage.sh
#! /bin/bash

# Delete every carriage-return character (\r) from file.txt, rewriting the
# file in place (sed -i). The /g flag removes all occurrences on each line,
# not only a trailing one.
# NOTE(review): the \r escape and a suffix-less -i are GNU sed behaviors —
# confirm the sed in use supports them.
 sed -i 's/\r//g' file.txt

## gist:28a7967d74b6b50d08402950d7fb794a
#! /bin/bash

# Kegg enrichment
~/programs/geneSCF-master-source-v1.1-p2/geneSCF -m='update' -i='sig_ids.txt' -t='gid' -db='KEGG' -org='mmu' -p='yes' -bg=20000 -o='.'

# Prepare GO database
~/programs/geneSCF-master-source-v1.1-p2/prepare_database -db=GO_all -org=mgi

# Run enrichment against cellular component (CC), biological process (BP), molecular function (MF)
~/programs/geneSCF-master-source-v1.1-p2/geneSCF -m=normal -i=sig_ids.txt -o=. -t=gid -db=GO_CC -bg=20000 --plot=yes -org=mgi

## plot_coverage.R
library("wiggleplotr")
library("dplyr")
library("GenomicRanges")
library("GenomicFeatures")
library("EnsDb.Mmusculus.v79")
library("ensembldb")

setwd(<working_dir>)
list.files()

## cluster_sidebar.R
library(pheatmap)

# We need to use cutree inside pheatmap()
# function, capture the returned object into a variable,
# and then extract the tree, cut it again to the same
# number of clusters and reorder.
# Resulting vector is converted to factor and used in the annotation data frame
# to create cluster labels

x <- some_matrix # matrix to cluster and show as a heatmap

## gage_wikipath.R
library(gage)
library(org.Hs.eg.db)
library(pathview)
library(rWikiPathways)
library(DESeq2)

## Get annotation from csv file (obtained via biomaRt)
# Read the comma-separated annotation table into a data frame. The first line
# is treated as a header, double quotes delimit quoted fields, short rows are
# padded (fill = TRUE) and strings are kept as character vectors.
# TRUE/FALSE are spelled out because T and F are ordinary variables that can
# be reassigned.
annot <- read.table("../annotation/annotation.csv", sep = ",", header = TRUE,
                    quote = "\"", fill = TRUE, stringsAsFactors = FALSE)

## download_with_geoquery.R
library(GEOquery)

# Get GSE
gse <- getGEO("GSE147507",GSEMatrix=FALSE)
head(Meta(gse))

# names of all the GSM objects contained in the GSE
names(GSMList(gse))

# and get the first GSM object on the list

## get_longest_transcript.R
library("GenomicFeatures")
library("DESeq2")
library("TxDb.Hsapiens.UCSC.hg19.knownGene")
library("BSgenome.Hsapiens.UCSC.hg19")
library("biomaRt")

setwd("<working_dir>")

# Assuming we are working with DESeq2 projects
# get all genes expressed in the project

## SPIAcalc.R
resFile <- "DESeq2_result_name" # has to have entrez ids as annotations
compName <- "comparison" # Name of the comparison

calcSPIA <- function(resFile, compName)
{
  res <- read.table(resFile, header = T, sep = "\t", fill=T, quote = "\"", stringsAsFactors = F)

  # Select differentially expressed genes (adjusted p-value < 0.05)
  # This is a named vector, where names are entrez ids, and values are log2 fold changes
  ids <- res[res$padj < 0.05,]$entrezgene_id

## collect_HISAT2_mapping_stats.sh
#! /bin/bash

# 22498713 reads; of these:
#   22498713 (100.00%) were unpaired; of these:
#     1754404 (7.80%) aligned 0 times
#     17667104 (78.52%) aligned exactly 1 time
#     3077205 (13.68%) aligned >1 times
# 92.20% overall alignment rate

echo -e "sampleID\ttotal_reads\tunmapped\taligned_one_time\taligned_multiple\talignment_rate"

## edgeR_GLM_smallRNA.R
library(edgeR)

setwd(<dir>)

# Collect the tag-count files (tab-delimited, with sequence tags and raw
# counts), returned with their full paths.
# NOTE(review): "/tag_counts/" is an absolute path from the filesystem root —
# confirm this is intended rather than a directory under the working dir.
files <- list.files("/tag_counts/", full.names = TRUE)
files
# Read the files with edgeR's readDGE, taking columns 1 and 2 of each file,
# and keep the resulting count matrix.
d <- readDGE(files, columns = c(1, 2))
counts <- d$counts
# Template placeholder: replace <sampleFile> with the path to the sample table.
sampleFile <- <sampleFile>
# Read the tab-delimited sample table (with a header row); strings stay
# character vectors. TRUE/FALSE are spelled out because T and F are ordinary
# variables that can be reassigned.
sampleInfo <- read.table(sampleFile, sep = "\t", header = TRUE,
                         stringsAsFactors = FALSE)
	#! /bin/bash

	# Kegg enrichment
	~/programs/geneSCF-master-source-v1.1-p2/geneSCF -m='update' -i='sig_ids.txt' -t='gid' -db='KEGG' -org='mmu' -p='yes' -bg=20000 -o='.'

	# Prepare GO database
	~/programs/geneSCF-master-source-v1.1-p2/prepare_database -db=GO_all -org=mgi

	# Run enrichment against cellular component (CC), biological process (BP), molecular function (MF)
	~/programs/geneSCF-master-source-v1.1-p2/geneSCF -m=normal -i=sig_ids.txt -o=. -t=gid -db=GO_CC -bg=20000 --plot=yes -org=mgi
	library("wiggleplotr")
	library("dplyr")
	library("GenomicRanges")
	library("GenomicFeatures")
	library("EnsDb.Mmusculus.v79")
	library("ensembldb")

	setwd(<working_dir>)
	list.files()
	library(pheatmap)

	# We need to use cutree inside pheatmap()
	# function, capture the returned object into a variable,
	# and then extract the tree, cut it again to the same
	# number of clusters and reorder.
	# Resulting vector is converted to factor and used in the annotation data frame
	# to create cluster labels

	x <- some_matrix # matrix to cluster and show as a heatmap
	library(gage)
	library(org.Hs.eg.db)
	library(pathview)
	library(rWikiPathways)
	library(DESeq2)

	## Get annotation from csv file (obtained via biomaRt)
	# Read the comma-separated annotation table into a data frame. The first line
	# is treated as a header, double quotes delimit quoted fields, short rows are
	# padded (fill = TRUE) and strings are kept as character vectors.
	# TRUE/FALSE are spelled out because T and F are ordinary variables that can
	# be reassigned.
	annot <- read.table("../annotation/annotation.csv", sep = ",", header = TRUE,
	quote = "\"", fill = TRUE, stringsAsFactors = FALSE)
	library(GEOquery)

	# Get GSE
	gse <- getGEO("GSE147507",GSEMatrix=FALSE)
	head(Meta(gse))

	# names of all the GSM objects contained in the GSE
	names(GSMList(gse))

	# and get the first GSM object on the list
	library("GenomicFeatures")
	library("DESeq2")
	library("TxDb.Hsapiens.UCSC.hg19.knownGene")
	library("BSgenome.Hsapiens.UCSC.hg19")
	library("biomaRt")

	setwd("<working_dir>")

	# Assuming we are working with DESeq2 projects
	# get all genes expressed in the project
	resFile <- "DESeq2_result_name" # has to have entrez ids as annotations
	compName <- "comparison" # Name of the comparison

	calcSPIA <- function(resFile, compName)
	{
	res <- read.table(resFile, header = T, sep = "\t", fill=T, quote = "\"", stringsAsFactors = F)

	# Select differentially expressed genes (adjusted p-value < 0.05)
	# This is a named vector, where names are entrez ids, and values are log2 fold changes
	ids <- res[res$padj < 0.05,]$entrezgene_id
	#! /bin/bash

	# 22498713 reads; of these:
	# 22498713 (100.00%) were unpaired; of these:
	# 1754404 (7.80%) aligned 0 times
	# 17667104 (78.52%) aligned exactly 1 time
	# 3077205 (13.68%) aligned >1 times
	# 92.20% overall alignment rate

	echo -e "sampleID\ttotal_reads\tunmapped\taligned_one_time\taligned_multiple\talignment_rate"
	library(edgeR)

	setwd(<dir>)

	# Collect the tag-count files (tab-delimited, with sequence tags and raw
	# counts), returned with their full paths.
	# NOTE(review): "/tag_counts/" is an absolute path from the filesystem root —
	# confirm this is intended rather than a directory under the working dir.
	files <- list.files("/tag_counts/", full.names = TRUE)
	files
	# Read the files with edgeR's readDGE, taking columns 1 and 2 of each file,
	# and keep the resulting count matrix.
	d <- readDGE(files, columns = c(1, 2))
	counts <- d$counts
	# Template placeholder: replace <sampleFile> with the path to the sample table.
	sampleFile <- <sampleFile>
	# Read the tab-delimited sample table (with a header row); strings stay
	# character vectors. TRUE/FALSE are spelled out because T and F are ordinary
	# variables that can be reassigned.
	sampleInfo <- read.table(sampleFile, sep = "\t", header = TRUE,
	                         stringsAsFactors = FALSE)