Simple overview of use/purpose.
An in-depth paragraph about your project and overview of use.
The repository for the assignment is public, and GitHub does not allow the creation of private forks of public repositories.
The correct way of creating a private fork by duplicating the repo is documented here.
For this assignment the commands are:
git clone --bare git@github.com:usi-systems/easytrace.git
## Usage Function | |
usage() | |
{ | |
cat << EOF | |
usage: $0 options | |
This script is used to calculate the MD5SUM of all files in a given directory. | |
REQUIRED ARGUMENTS: | |
-f Input file |
## Make sure you have the development version of biomaRt installed. If not then run:
## BiocInstaller::biocLite('grimbough/biomaRt')
library(biomaRt)

## Open a connection to the Ensembl mart.
mart <- useEnsembl("ENSEMBL_MART_ENSEMBL")
## Re-assign `mart` to the human gene dataset. This overwrites the object
## created on the previous line, so only this second connection survives.
mart <- useMart(biomart = "ensembl", dataset = "hsapiens_gene_ensembl")
bmIDs = getBM(attributes=c('ensembl_gene_id','ensembl_transcript_id', | |
+ 'description', | |
+ 'chromosome_name', |
## Row median scale data
## NOTE(review): rowMedians() is not defined in this snippet -- presumably it
## comes from an attached package such as matrixStats; confirm it is loaded.
## The result is stored under the same name as the function. R still resolves
## the call on the right-hand side to the function, but any later call to
## rowMedians() will have to skip over this numeric vector during lookup.
rowMedians <- rowMedians(as.matrix(human_TPM_pam50_selected))

## Subtract each row's median from every value in that row. The vector has one
## entry per row and is recycled down each column, so the alignment is by row.
human_TPM_pam50_selected_scaled <- human_TPM_pam50_selected - rowMedians
## From forum post https://bioinformatics.stackexchange.com/questions/2586/how-to-apply-upperquartile-normalization-on-rsem-expected-counts
## this uses all data in the quantile function, including those that are not expressed
## Assumes genes are rows and samples are columns
## One 75th-percentile value is computed per column (MARGIN = 2).
data.quantileAll <- apply(unnormalized_mtx, 2, function(x) {
  quantile(x, 0.75)
})

## this uses only the values that are greater than zero.
## data.quantileExpressed <- apply(unnormalized_mtx, 2, function(x){quantile(x[x>0], 0.75)});

## Divide every column by its own upper quartile. Transposing first lets the
## per-column vector recycle along the right dimension; the second t() restores
## the original orientation.
data.norm <- t(t(unnormalized_mtx) / data.quantileAll)
## Amy Olex
## PAM50 Classification of PDX samples.
library(genefu)

## NOTE(review): placeholder path -- replace with the real project directory.
## The relative file name below is resolved against this working directory.
setwd("~/my/working/folder")

## import expression data
tpm <- read.delim("Gene_Expression_Data_TPMvalues.txt")
library(AnnotationHub)
library(ensembldb)

## Connect to AnnotationHub and search for records matching all three terms.
ah <- AnnotationHub()
## The argument is spelled out in full as `pattern`; the previous `patter`
## only worked through R's partial argument matching.
annot_h <- query(ah, pattern = c("Homo", "EnsDb", "87"))

## Take the first matching record.
## NOTE(review): assumes the first hit is the intended human EnsDb release --
## confirm by inspecting `annot_h` before relying on it.
EnsDb_h <- annot_h[[1]]

df_human_transcripts <- transcripts(EnsDb_h, return.type = "DataFrame") ## pulls out transcript IDs
df_human_gene <- genes(EnsDb_h, return.type = "DataFrame") ## pulls out gene IDs
#!/bin/bash | |
### Author ### | |
# Amy Olex | |
# 3/12/15 | |
# Function: maxjobs | |
# | |
### Description ### | |
# Essentially a helper function to control how many jobs get started at the same time. | |
# You will need to source this function in any script you wish to use it in. |
## Instructions for working with subtypes in TCGABiolinks: https://www.bioconductor.org/packages/devel/bioc/vignettes/TCGAbiolinks/inst/doc/tcgaBiolinks.html#tcgaquery_subtype:_working_with_molecular_subtypes_data
## Note: the subtypes are from 2012 paper, so do not include ALL TCGA samples.
## https://support.bioconductor.org/p/91855/

## NOTE(review): this sources the legacy biocLite.R installer bootstrap over
## plain HTTP, i.e. it executes remote code fetched without transport security.
## Current Bioconductor releases install packages through BiocManager instead;
## confirm whether this line is still needed before running the script.
source("http://www.bioconductor.org/biocLite.R")
library(TCGAbiolinks)

#### RNASeq Gene Expression
## query.gene <- GDCquery(project="TCGA-BRCA", sample.type="Primary solid Tumor", data.category="Transcriptome Profiling", data.type="Gene Expression Quantification", workflow.type="HTSeq - Counts")