slava ilnytskyy slavailn

## GEOquery_download.R
library(GEOquery)

# Download existing RA datasets

# Study 1
# Teixeira VH, Olaso R, Martin-Magniette ML, Lasbleiz S et al. Transcriptome
# analysis describing new immunity and defense genes in peripheral blood
# mononuclear cells of rheumatoid arthritis patients.
# PLoS One 2009 Aug 27;4(8):e6803. PMID: 19710928
# Reference: 	GSE15573

## get_snps_by_rsids.sh
# Extract the records of a VCF/BCF file whose ID matches a list of rs ids.
# Recipe by Kevin Blighe: https://www.biostars.org/p/373852/
#
# Prerequisite: a plain-text file listing the SNP ids of interest, one id
# per line. The file name is arbitrary; snp.txt is used here.

# Keep only the records whose ID is listed in snp.txt
bcftools view -i 'ID==@snp.txt' target.bcf

## fetch_fastq.sh
# Recipe from https://www.biostars.org/p/111040/
# Fetch the run metadata of the BioProject and save it as a CSV file
esearch -db sra -query PRJNA484081 | efetch -format runinfo > bioproj.csv

# Run ids are in the first comma-separated column of the runinfo file
cut -d ',' -f 1 bioproj.csv | head

# Before downloading anything (e.g. the first 4 files as an example),
# check that the right run ids are being selected
cut -d ',' -f 1 bioproj.csv | grep 'SRR'

## merge_lanes.sh
#! /bin/bash

# Taken from https://github.com/stephenturner/mergelanes/issues/1
# Exercise caution, does not work accurately in every case:
# Not working accurately for sample IDs like "A11_Barcodexxx_S11_L001_R1_001".
# It also concatenates all L001 files belonging to sample ID A11, even those with different barcodes

ls *R1* | cut -d _ -f 1 | sort | uniq \
    | while read id; do \
        cat $id*R1*.fastq.gz > $id.R1.fastq.gz;

## entrez_to_KEGG.R
library(KEGGREST)
library(org.Rn.eg.db)

# Goal: a table relating entrez ids to KEGG pathways, where one column holds
# the entrez id and the other a comma separated list of its KEGG pathways

# Fetch the pathway links for organism "rno", then remove the "rno:" text
# from the names of the returned vector
rno_path_eg <- keggLink("pathway", "rno")
rno_path_eg <- setNames(
  rno_path_eg,
  gsub("rno:", "", names(rno_path_eg), fixed = TRUE)
)

## Two_factor_anova_DESeq2.R
library(RColorBrewer)
library(ggplot2)
library(ggrepel)

# Switch to the project directory (replace the placeholder with a real path)
setwd(dir = "<path/to/dir>")

# Restore the saved DESeq2 object from disk and display it
# (dds is presumably the object stored in the .RData file)
load(file = "expression_data/DESeqOBJ.RData")
dds

## build_org_db.R
library(AnnotationForge)
library(biomaRt)
setwd(dir = "path/to/dir")

## Connect to the Ensembl BioMart, then select the Ovis aries gene dataset
## that the annotation data frame is retrieved from
ensembl <- useMart(
  biomart = "ENSEMBL_MART_ENSEMBL",
  host = "https://www.ensembl.org"
)
ensembl <- useDataset(dataset = "oaries_gene_ensembl", mart = ensembl)
# Dataset listing entry:
# 126 oaries_gene_ensembl Sheep (texel) genes (Oar_v3.1) Oar_v3.1

## enrichment_bubble.R
library(ggplot2)
library(stringr)

# Default theme for subsequent plots: black-and-white with the legend on the right
theme_set(theme_bw() + theme(legend.position = "right"))

ggplot(all_go, aes(x = sample_id, y = reorder(GO.label, Enrichment))) +
  geom_point(aes(size = Enrichment, fill = P.value), alpha = 0.75, shape = 21) +

## problem and solution when running RSEM, for future reference
In one of the conda environments I encountered a bowtie2 installation problem that broke the RSEM run.
The solution to the problem is described in the following Biostars post:
https://www.biostars.org/p/494922/

See the post content below:

```

$ conda create -n bttest -c bioconda bowtie2
$ conda activate bttest

## biomaRt_paralogs.R
# Get paralogs for the list of genes

library(biomaRt)
# Connect to the human gene dataset of the "ensembl" mart
human <- useMart(biomart = "ensembl", dataset = "hsapiens_gene_ensembl")
# Gene symbols whose paralogs are looked up
gene_id <- c("TPM1", "BOD1", "ADAP1")
results <- getBM(attributes = c("ensembl_gene_id",
                                "external_gene_name",
                                "hsapiens_paralog_ensembl_gene",
                                "hsapiens_paralog_associated_gene_name"),
                 filters = "external_gene_name",
	library(GEOquery)

	# Download existing RA datasets

	# Study 1
	# Teixeira VH, Olaso R, Martin-Magniette ML, Lasbleiz S et al. Transcriptome
	# analysis describing new immunity and defense genes in peripheral blood
	# mononuclear cells of rheumatoid arthritis patients.
	# PLoS One 2009 Aug 27;4(8):e6803. PMID: 19710928
	# Reference: GSE15573
	# Get SNP records based on rs ids from VCF file
	# Kevin Blighe recipe from https://www.biostars.org/p/373852/

	# Create a file with snp ids of interest, one id per line
	# We can give the newly created file any name, for example snp.txt

	bcftools view --include ID==@snp.txt target.bcf
	# Taken from https://www.biostars.org/p/111040/
	# Examine and save metadata
	esearch -db sra -query PRJNA484081 | efetch -format runinfo > bioproj.csv

	# The first column of the comma-separated runinfo file contains the run ids
	cat bioproj.csv | cut -d ',' -f 1 | head

	# Download first 4 files as an example
	cat bioproj.csv | cut -d ',' -f 1 | grep 'SRR' # Check if we are selecting the right files
	#! /bin/bash

	# Taken from https://github.com/stephenturner/mergelanes/issues/1
	# Exercise caution, does not work accurately in every case:
	# Not working accurately for sample IDs like "A11_Barcodexxx_S11_L001_R1_001".
	# It also concatenates all L001 files belonging to sample ID A11, even those with different barcodes

	ls *R1* | cut -d _ -f 1 | sort | uniq \
	| while read id; do \
	cat $id*R1*.fastq.gz > $id.R1.fastq.gz;
	library(KEGGREST)
	library(org.Rn.eg.db)

	# Download entrez ids and corresponding KEGG pathways followed
	# by creation of a table where one column is entrez id and another
	# column is a comma separated list of KEGG pathways

	# Download pathway to entrez id relationship
	rno_path_eg <- keggLink("pathway", "rno")
	names(rno_path_eg) <- gsub("rno:", "", names(rno_path_eg))
	library(RColorBrewer)
	library(ggplot2)
	library(ggrepel)

	setwd("<path/to/dir>")

	# Load DESeq2 object
	load("expression_data/DESeqOBJ.RData")
	dds
	library(AnnotationForge)
	library(biomaRt)
	setwd("path/to/dir")

	## Get annotation data frame for Ovis aries from BiomaRt
	ensembl <- useMart("ENSEMBL_MART_ENSEMBL", host="https://www.ensembl.org")

	ensembl <- useDataset("oaries_gene_ensembl", mart=ensembl)
	#126 oaries_gene_ensembl Sheep (texel) genes (Oar_v3.1) Oar_v3.1
	library(ggplot2)
	library(stringr)

	theme_set(
	theme_bw() +
	theme(legend.position = "right")
	)

	ggplot(all_go, aes(x = sample_id, y = reorder(GO.label, Enrichment))) +
	geom_point(aes(size = Enrichment, fill = P.value), alpha = 0.75, shape = 21) +
	In one of the conda environments I encountered a bowtie2 installation problem that broke the RSEM run.
	The solution to the problem is described in the following Biostars post:
	https://www.biostars.org/p/494922/

	See the post content below:

	```

	$ conda create -n bttest -c bioconda bowtie2
	$ conda activate bttest
	# Get paralogs for the list of genes

	library(biomaRt)
	human <- useMart("ensembl", dataset = "hsapiens_gene_ensembl")
	gene_id <- c("TPM1", "BOD1", "ADAP1")
	results <- getBM(attributes = c("ensembl_gene_id",
	"external_gene_name",
	"hsapiens_paralog_ensembl_gene",
	"hsapiens_paralog_associated_gene_name"),
	filters = "external_gene_name",