Amy Olex AmyOlex

## README-Template.md

      
              1 file
            
          
              0 forks
            
          
                0 comments
              
            
              0 stars
            
          
                AmyOlex
                / README-Template.md
            
            
              Created
              July 26, 2023 17:40
                — forked from DomPizzie/README-Template.md
            
              
                A simple README.md template
              
          
    Project Title

Simple overview of use/purpose.
Description

An in-depth paragraph about your project and overview of use.
Getting Started


## private_fork.md

      
              1 file
            
          
              0 forks
            
          
                0 comments
              
            
              0 stars
            
          
                AmyOlex
                / private_fork.md
            
            
              Created
              December 3, 2018 21:21
                — forked from 0xjac/private_fork.md
            
              
                Create a private fork of a public repository
              
          
    The repository for the assignment is public and Github does not allow the creation of private forks for public repositories.
The correct way of creating a private fork by duplicating the repo is documented here.
For this assignment the commands are:

Create a bare clone of the repository.
(This is temporary and will be removed so just do it wherever.)


git clone --bare git@github.com:usi-systems/easytrace.git

  
## cmdargs.sh
## Usage Function
usage()
{
cat << EOF
usage: $0 options
This script is used to calculate the MD5SUM of all files in a given directory.

REQUIRED ARGUMENTS:
   -f      Input file

## Ensemble-to-Entrez.R
## Requires the development version of biomaRt. If it is missing, install it with:
## BiocInstaller::biocLite('grimbough/biomaRt')

library(biomaRt)

## Obtain a mart handle via useEnsembl(); it is replaced by the next assignment.
mart <- useEnsembl("ENSEMBL_MART_ENSEMBL")
## Re-bind `mart` to the "ensembl" mart with the human gene dataset selected.
mart <- useMart(biomart = "ensembl", dataset = "hsapiens_gene_ensembl")
bmIDs = getBM(attributes=c('ensembl_gene_id','ensembl_transcript_id',
+                            'description',
+                            'chromosome_name',

## RowMedianCenter
## Row median scale data: center every row on its own median.
## Per-row medians of the expression table (coerced to a matrix first).
## NOTE(review): rowMedians() is not base R -- presumably from matrixStats; confirm it is loaded.
## The result is stored under the same name as the function that produced it.
rowMedians <- rowMedians(as.matrix(human_TPM_pam50_selected))
## Subtract the medians: the vector is recycled down each column, so median i
## is removed from every value in row i.
human_TPM_pam50_selected_scaled <- human_TPM_pam50_selected - rowMedians

## UpperQuantilNorm
## Upper-quartile normalization of an expression matrix.
## From forum post https://bioinformatics.stackexchange.com/questions/2586/how-to-apply-upperquartile-normalization-on-rsem-expected-counts

## Per-sample 75th percentile computed over all values, including genes that
## are not expressed.
## Assumes genes are rows and samples are columns
data.quantileAll <- apply(unnormalized_mtx, 2, function(x) {
  quantile(x, 0.75)
})

## Alternative: use only the values that are greater than zero.
##data.quantileExpressed <- apply(unnormalized_mtx, 2, function(x){quantile(x[x>0], 0.75)});

## Because zeros are included above, a sample's upper quartile can itself be
## zero; the division below would then give Inf/NaN for that sample. Results
## are left unchanged, but the situation is reported instead of passing silently.
if (any(data.quantileAll == 0)) {
  warning(
    "Upper quartile is zero for at least one sample; ",
    "its normalized values will be Inf/NaN.",
    call. = FALSE
  )
}

## Transpose so samples are rows, divide each sample by its own quartile
## (the vector recycles down the sample dimension), then transpose back.
data.norm <- t(t(unnormalized_mtx) / data.quantileAll)

## PAM50 Classification Using genefu() R Package
## Amy Olex
## PAM50 Classification of PDX samples.

## Load genefu, the package named for the PAM50 classification.
library(genefu)

## NOTE(review): looks like a placeholder path -- point it at the folder that
## holds the input file; the relative path below resolves against it.
setwd("~/my/working/folder")

## import expression data
## read.delim() reads a tab-delimited file and treats the first line as a header.
tpm <- read.delim("Gene_Expression_Data_TPMvalues.txt")

## Mapping Ensembl, Entrez, and Gene Symbol
## Build transcript and gene annotation tables from an EnsDb record in AnnotationHub.
library(AnnotationHub)
library(ensembldb)

ah <- AnnotationHub()
## Search the hub for records matching all of "Homo", "EnsDb" and "87".
## The argument name is spelled out in full instead of relying on partial
## matching of `patter` to `pattern`.
annot_h <- query(ah, pattern = c("Homo", "EnsDb", "87"))
## Take the first matching record.
EnsDb_h <- annot_h[[1]]
df_human_transcripts <- transcripts(EnsDb_h, return.type = "DataFrame")  ## pulls out transcript IDs
df_human_gene <- genes(EnsDb_h, return.type = "DataFrame")  ## pulls out gene IDs

## maxjobs
#!/bin/bash

### Author ###
#  Amy Olex
#  3/12/15
#  Function: maxjobs
#
### Description ###
# Essentially a helper function to control how many jobs get started at the same time.
# You will need to source this function in any script you wish to use it in.

## get_TCGA_PAM50_subtypes.R
## Instructions for working with subtypes in TCGABiolinks: https://www.bioconductor.org/packages/devel/bioc/vignettes/TCGAbiolinks/inst/doc/tcgaBiolinks.html#tcgaquery_subtype:_working_with_molecular_subtypes_data
## Note: the subtypes are from 2012 paper, so do not include ALL TCGA samples.

## https://support.bioconductor.org/p/91855/

## The biocLite.R installer script has been retired by Bioconductor in favour
## of BiocManager, and sourcing a script over plain http is unsafe. Check for
## the package directly and tell the user how to install it if it is missing.
if (!requireNamespace("TCGAbiolinks", quietly = TRUE)) {
  stop(
    "Package 'TCGAbiolinks' is required; install it with ",
    "BiocManager::install(\"TCGAbiolinks\")",
    call. = FALSE
  )
}
library(TCGAbiolinks)

#### RNASeq Gene Expression
## query.gene <- GDCquery(project="TCGA-BRCA", sample.type="Primary solid Tumor", data.category="Transcriptome Profiling", data.type="Gene Expression Quantification", workflow.type="HTSeq - Counts")
	## Usage Function
	usage()
	{
	cat << EOF
	usage: $0 options
	This script is used to calculate the MD5SUM of all files in a given directory.

	REQUIRED ARGUMENTS:
	-f Input file
	## Requires the development version of biomaRt. If it is missing, install it with:
	## BiocInstaller::biocLite('grimbough/biomaRt')

	library(biomaRt)

	## Obtain a mart handle via useEnsembl(); it is replaced by the next assignment.
	mart <- useEnsembl("ENSEMBL_MART_ENSEMBL")
	## Re-bind `mart` to the "ensembl" mart with the human gene dataset selected.
	mart <- useMart(biomart = "ensembl", dataset = "hsapiens_gene_ensembl")
	bmIDs = getBM(attributes=c('ensembl_gene_id','ensembl_transcript_id',
	+ 'description',
	+ 'chromosome_name',
	## Row median scale data: center every row on its own median.
	## Per-row medians of the expression table (coerced to a matrix first).
	## NOTE(review): rowMedians() is not base R -- presumably from matrixStats; confirm it is loaded.
	## The result is stored under the same name as the function that produced it.
	rowMedians <- rowMedians(as.matrix(human_TPM_pam50_selected))
	## Subtract the medians: the vector is recycled down each column, so median i
	## is removed from every value in row i.
	human_TPM_pam50_selected_scaled <- human_TPM_pam50_selected - rowMedians
	## Upper-quartile normalization of an expression matrix.
	## From forum post https://bioinformatics.stackexchange.com/questions/2586/how-to-apply-upperquartile-normalization-on-rsem-expected-counts

	## Per-sample 75th percentile computed over all values, including genes that
	## are not expressed.
	## Assumes genes are rows and samples are columns
	data.quantileAll <- apply(unnormalized_mtx, 2, function(x) {
	  quantile(x, 0.75)
	})

	## Alternative: use only the values that are greater than zero.
	##data.quantileExpressed <- apply(unnormalized_mtx, 2, function(x){quantile(x[x>0], 0.75)});

	## Because zeros are included above, a sample's upper quartile can itself be
	## zero; the division below would then give Inf/NaN for that sample. Results
	## are left unchanged, but the situation is reported instead of passing silently.
	if (any(data.quantileAll == 0)) {
	  warning(
	    "Upper quartile is zero for at least one sample; ",
	    "its normalized values will be Inf/NaN.",
	    call. = FALSE
	  )
	}

	## Transpose so samples are rows, divide each sample by its own quartile
	## (the vector recycles down the sample dimension), then transpose back.
	data.norm <- t(t(unnormalized_mtx) / data.quantileAll)
	## Amy Olex
	## PAM50 Classification of PDX samples.

	## Load genefu, the package named for the PAM50 classification.
	library(genefu)

	## NOTE(review): looks like a placeholder path -- point it at the folder that
	## holds the input file; the relative path below resolves against it.
	setwd("~/my/working/folder")

	## import expression data
	## read.delim() reads a tab-delimited file and treats the first line as a header.
	tpm <- read.delim("Gene_Expression_Data_TPMvalues.txt")
	## Build transcript and gene annotation tables from an EnsDb record in AnnotationHub.
	library(AnnotationHub)
	library(ensembldb)
	ah <- AnnotationHub()
	## Search the hub for records matching all of "Homo", "EnsDb" and "87".
	## The argument name is spelled out in full instead of relying on partial
	## matching of `patter` to `pattern`.
	annot_h <- query(ah, pattern = c("Homo", "EnsDb", "87"))
	## Take the first matching record.
	EnsDb_h <- annot_h[[1]]
	df_human_transcripts <- transcripts(EnsDb_h, return.type = "DataFrame") ## pulls out transcript IDs
	df_human_gene <- genes(EnsDb_h, return.type = "DataFrame") ## pulls out gene IDs
	#!/bin/bash

	### Author ###
	# Amy Olex
	# 3/12/15
	# Function: maxjobs
	#
	### Description ###
	# Essentially a helper function to control how many jobs get started at the same time.
	# You will need to source this function in any script you wish to use it in.
	## Instructions for working with subtypes in TCGABiolinks: https://www.bioconductor.org/packages/devel/bioc/vignettes/TCGAbiolinks/inst/doc/tcgaBiolinks.html#tcgaquery_subtype:_working_with_molecular_subtypes_data
	## Note: the subtypes are from 2012 paper, so do not include ALL TCGA samples.

	## https://support.bioconductor.org/p/91855/

	## The biocLite.R installer script has been retired by Bioconductor in favour
	## of BiocManager, and sourcing a script over plain http is unsafe. Check for
	## the package directly and tell the user how to install it if it is missing.
	if (!requireNamespace("TCGAbiolinks", quietly = TRUE)) {
	  stop(
	    "Package 'TCGAbiolinks' is required; install it with ",
	    "BiocManager::install(\"TCGAbiolinks\")",
	    call. = FALSE
	  )
	}
	library(TCGAbiolinks)

	#### RNASeq Gene Expression
	## query.gene <- GDCquery(project="TCGA-BRCA", sample.type="Primary solid Tumor", data.category="Transcriptome Profiling", data.type="Gene Expression Quantification", workflow.type="HTSeq - Counts")