slava ilnytskyy slavailn

## GEOquery_download.R
library(GEOquery)

# Download existing RA datasets

# Study 1
# Teixeira VH, Olaso R, Martin-Magniette ML, Lasbleiz S et al. Transcriptome
# analysis describing new immunity and defense genes in peripheral blood
# mononuclear cells of rheumatoid arthritis patients.
# PLoS One 2009 Aug 27;4(8):e6803. PMID: 19710928
# Reference: GSE15573

## get_snps_by_rsids.sh
# Get SNP records based on rs ids from VCF file
# Kevin Blighe recipe from https://www.biostars.org/p/373852/

# Create a file with snp ids of interest, one id per line
# We can give the newly created file any name, for example snp.txt

bcftools view --include ID==@snp.txt target.bcf

## fetch_fastq.sh
# Taken from https://www.biostars.org/p/111040/
# Query SRA for the BioProject and store its run metadata as a CSV file
esearch -db sra -query PRJNA484081 | efetch -format runinfo > bioproj.csv

# Run ids are in the first comma-separated column; preview the first few
cut -d ',' -f 1 bioproj.csv | head

# Keep only the entries containing 'SRR' to check that we are selecting
# the right files before downloading
cut -d ',' -f 1 bioproj.csv | grep 'SRR'

## merge_lanes.sh
#! /bin/bash

# Taken from https://github.com/stephenturner/mergelanes/issues/1
# Exercise caution, does not work accurately in every case:
# Not working accurately for sample IDs like "A11_Barcodexxx_S11_L001_R1_001".
# It also concatenates all L001 files pertaining to sample ID A11, including those with different barcodes

ls *R1* | cut -d _ -f 1 | sort | uniq \
    | while read id; do \
        cat $id*R1*.fastq.gz > $id.R1.fastq.gz;

## biomaRt_paralogs.R
# Get paralogs for the list of genes

library(biomaRt)
human <- useMart("ensembl", dataset = "hsapiens_gene_ensembl")
gene_id <- c("TPM1", "BOD1", "ADAP1")
results <- getBM(attributes = c("ensembl_gene_id",
                                "external_gene_name",
                                "hsapiens_paralog_ensembl_gene",
                                "hsapiens_paralog_associated_gene_name"),
                 filters = "external_gene_name",

## fetch_promoter_seq.R
library(GenomicFeatures)
library(TxDb.Mmusculus.UCSC.mm10.knownGene)
library(BSgenome.Mmusculus.UCSC.mm10)
library(Biostrings)
# Fetch promoter sequences to detect enriched transcription factor
# motifs in the promoters of differentially expressed genes

setwd("/path/to/dir")
list.files()

## annotate_MACS2_peaks.R
library("ChIPpeakAnno")
library("GenomicRanges")
library("org.At.tair.db")
library("TxDb.Athaliana.BioMart.plantsmart28")
library("biomaRt")

# Annotate genomic intervals in bed format using ChIPpeakAnno
# This script was designed for Arabidopsis, but can be easily changed for
# any other organism available through biomaRt

## entrez_to_KEGG.R
library(KEGGREST)
library(org.Rn.eg.db)

# Download entrez ids and corresponding KEGG pathways followed
# by creation of a table where one column is entrez id and another
# column is a comma separated list of KEGG pathways

# Retrieve the KEGG "pathway" links for organism code "rno", then remove
# every "rno:" occurrence from the names of the returned vector
rno_path_eg <- keggLink("pathway", "rno")
names(rno_path_eg) <- gsub(
  pattern = "rno:",
  replacement = "",
  x = names(rno_path_eg)
)

## topGO_nonModel_organism.R
library(dplyr)
library(ggplot2)
library(reshape2)
library(topGO)
library(plyr)

# Read in some table with gene ids, for example expressions
coreGenes <- read.table(<some_table_gene_ids>)

# Get gene mappings to GO terms

## Two_factor_anova_DESeq2.R
library(RColorBrewer)
library(ggplot2)
library(ggrepel)

setwd("<path/to/dir>")

# Load DESeq2 object
load("expression_data/DESeqOBJ.RData")
dds
	library(GEOquery)

	# Download existing RA datasets

	# Study 1
	# Teixeira VH, Olaso R, Martin-Magniette ML, Lasbleiz S et al. Transcriptome
	# analysis describing new immunity and defense genes in peripheral blood
	# mononuclear cells of rheumatoid arthritis patients.
	# PLoS One 2009 Aug 27;4(8):e6803. PMID: 19710928
	# Reference: GSE15573
	# Get SNP records based on rs ids from VCF file
	# Kevin Blighe recipe from https://www.biostars.org/p/373852/

	# Create a file with snp ids of interest, one id per line
	# We can give the newly created file any name, for example snp.txt

	bcftools view --include ID==@snp.txt target.bcf
	# Taken from https://www.biostars.org/p/111040/
	# Examine and save metadata
	esearch -db sra -query PRJNA484081 | efetch -format runinfo > bioproj.csv

	# The first column of the comma-separated runinfo file contains the run ids
	cat bioproj.csv | cut -d ',' -f 1 | head

	# Download first 4 files as an example
	cat bioproj.csv | cut -d ',' -f 1 | grep 'SRR' # Check that we are selecting the right files
	#! /bin/bash

	# Taken from https://github.com/stephenturner/mergelanes/issues/1
	# Exercise caution, does not work accurately in every case:
	# Not working accurately for sample IDs like "A11_Barcodexxx_S11_L001_R1_001".
	# It also concatenates all L001 files pertaining to sample ID A11, including those with different barcodes

	ls *R1* | cut -d _ -f 1 | sort | uniq \
	| while read id; do \
	cat $id*R1*.fastq.gz > $id.R1.fastq.gz;
	# Get paralogs for the list of genes

	library(biomaRt)
	human <- useMart("ensembl", dataset = "hsapiens_gene_ensembl")
	gene_id <- c("TPM1", "BOD1", "ADAP1")
	results <- getBM(attributes = c("ensembl_gene_id",
	"external_gene_name",
	"hsapiens_paralog_ensembl_gene",
	"hsapiens_paralog_associated_gene_name"),
	filters = "external_gene_name",
	library(GenomicFeatures)
	library(TxDb.Mmusculus.UCSC.mm10.knownGene)
	library(BSgenome.Mmusculus.UCSC.mm10)
	library(Biostrings)
	# Fetch promoter sequences to detect enriched transcription factor
	# motifs in the promoters of differentially expressed genes

	setwd("/path/to/dir")
	list.files()
	library("ChIPpeakAnno")
	library("GenomicRanges")
	library("org.At.tair.db")
	library("TxDb.Athaliana.BioMart.plantsmart28")
	library("biomaRt")

	# Annotate genomic intervals in bed format using ChIPpeakAnno
	# This script was designed for Arabidopsis, but can be easily changed for
	# any other organism available through biomaRt
	library(KEGGREST)
	library(org.Rn.eg.db)

	# Download entrez ids and corresponding KEGG pathways followed
	# by creation of a table where one column is entrez id and another
	# column is a comma separated list of KEGG pathways

	# Download pathway to entrez id relationship
	rno_path_eg <- keggLink("pathway", "rno")
	names(rno_path_eg) <- gsub("rno:", "", names(rno_path_eg))
	library(dplyr)
	library(ggplot2)
	library(reshape2)
	library(topGO)
	library(plyr)

	# Read in some table with gene ids, for example expressions
	coreGenes <- read.table(<some_table_gene_ids>)

	# Get gene mappings to GO terms
	library(RColorBrewer)
	library(ggplot2)
	library(ggrepel)

	setwd("<path/to/dir>")

	# Load DESeq2 object
	load("expression_data/DESeqOBJ.RData")
	dds