Mikhail Dozmorov mdozmorov

## gist_mm39_excluderanges.R
# Download a list of problematic regions (aka blacklist) for the GRCm39/mm39
# mouse genome assembly. Defined by the Boyle-Lab/Blacklist
# software, High Signal and Low Mappability regions.
# See https://github.com/dozmorovlab/excluderanges for more information.

suppressMessages(library(httr)) # https://CRAN.R-project.org/package=httr
suppressMessages(library(GenomicRanges)) # https://bioconductor.org/packages/GenomicRanges/
# bedbase_id
bedbase_id <- "edc716833d4b5ee75c34a0692fc353d5"
# Construct output file name

## gist_T2T_excluderanges.R
# Download a list of problematic regions (aka blacklist) for the T2T-CHM13
# telomere-to-telomere human genome assembly. Defined by the Boyle-Lab/Blacklist
# software, High Signal and Low Mappability regions.
# See https://github.com/dozmorovlab/excluderanges for more information.

suppressMessages(library(httr)) # https://CRAN.R-project.org/package=httr
suppressMessages(library(GenomicRanges)) # https://bioconductor.org/packages/GenomicRanges/
# bedbase_id
bedbase_id <- "6548a002754cc1e882035293541b59a8"
# Construct output file name

## liftOver.R
# How to liftOver Paired BED data
library(dplyr)
library(tidyr)
library(readxl)
library(rtracklayer)

# Landscape of Cohesin-Mediated Chromatin Loops in the Human Genome
# https://www.nature.com/articles/s41586-020-2151-x
# Supplementary Table 4 | Pan-cell type cohesin-mediated chromatin loops, hg19 coordinates, paired BED data
url1 <- "https://static-content.springer.com/esm/art%3A10.1038%2Fs41586-020-2151-x/MediaObjects/41586_2020_2151_MOESM5_ESM.xlsx"

## Tesseract quick intro
# Tesseract Intro: https://cran.r-project.org/web/packages/tesseract/vignettes/intro.html
# Image source: https://twitter.com/davidsuter_epfl/status/1141959559654846464?s=20

library(tesseract)
eng <- tesseract("eng")
text <- tesseract::ocr("https://pbs.twimg.com/media/D9kOESnW4AEs90l?format=jpg&name=900x900", engine = eng)
cat(text)

## SCSig collection EDA
# Exploratory data analysis of SCSig collection: Signatures of Single Cell Identities
# http://www.gsea-msigdb.org/gsea/msigdb/supplementary_genesets.jsp
library(dplyr)
library(clusterProfiler)
# Read in gene sets
mtx <- read.gmt("http://software.broadinstitute.org/gsea/msigdb/supplemental/scsig.all.v1.0.symbols.gmt")
# Number of unique gene signatures
mtx$ont %>% unique() %>% length()
# Summary statistics on size of gene signatures
mtx %>% group_by(ont) %>% summarise(size = n()) %>% select(size) %>% summary()

## check_phred.sh
# https://www.biostars.org/p/63225/

FILE=VLI9_AA_S60_L008_R1_001.fastq.gz

zcat $FILE | head -n 40 | awk '{if(NR%4==0) printf("%s",$0);}' |  od -A n -t u1 | awk 'BEGIN{min=100;max=0;}{for(i=1;i<=NF;i++) {if($i>max) max=$i; if($i<min) min=$i;}}END{if(max<=74 && min<59) print "Phred+33"; else if(max>73 && min>=64) print "Phred+64"; else if(min>=59 && min<64 && max>73) print "Solexa+64"; else print "Unknown score encoding";}'

## gist:3dda64fada07938159afa4532d8c1a63
# PCA: Check for batch effects. Select one batch, to color points by its assignment
pca <-  mtx %>% varFilter(., var.cutoff = 0.75) %>% scale %>% t %>% prcomp

colorby <- "Group" # covariates[2]
pt <- ggplot(data = data.frame(pca$x, annot),
             aes(x = as.numeric(PC1), y = as.numeric(PC2), label = Sample)) +
  theme(plot.title = element_text(lineheight = 0.8, face="bold")) +
  ggtitle(paste("PCA with batch, coloring by ", colorby)) +
  geom_point(aes(color = eval(parse(text = colorby))), size = 3) +
  geom_text_repel(colour = "black", size = 3) +

## Forbes2000.csv

          
            rank
            name
            country
            category
            sales
            profits
            assets
            marketvalue

            
              1
              Citigroup
              United States
              Banking
              94.71
              17.85
              1264.03
              255.3

            
              2
              General Electric
              United States
              Conglomerates
              134.19
              15.59
              626.93
              328.54

            
              3
              American Intl Group
              United States
              Insurance
              76.66
              6.46
              647.66
              194.87

            
              4
              ExxonMobil
              United States
              Oil & gas operations
              222.88
              20.96
              166.99
              277.02

            
              5
              BP
              United Kingdom
              Oil & gas operations
              232.57
              10.27
              177.57
              173.54

            
              6
              Bank of America
              United States
              Banking
              49.01
              10.81
              736.45
              117.55

            
              7
              HSBC Group
              United Kingdom
              Banking
              44.33
              6.66
              757.6
              177.96

            
              8
              Toyota Motor
              Japan
              Consumer durables
              135.82
              7.99
              171.71
              115.4

            
              9
              Fannie Mae
              United States
              Diversified financials
              53.13
              6.48
              1019.17
              76.84

## mtx_for_PCA.csv
"","E116.11BivFlnk.chromStates15","E116.14ReprPCWk.chromStates15","E116.15Quies.chromStates15","E116.1TssA.chromStates15","E116.2TssAFlnk.chromStates15","E116.3TxFlnk.chromStates15","E116.4Tx.chromStates15","E116.5TxWk.chromStates15","E116.6EnhG.chromStates15","E116.7Enh.chromStates15","E116.8ZNFRpts.chromStates15","E116.9Het.chromStates15","category"
"crohns_disease",0.162052705452119,0.0635253490922535,-19.2252181061205,1.06711427791428,9.90758142461494,0.00255822057746797,2.56100484438806,-0.253587329445552,3.38505679108121,11.2870094369073,2.23623723813231,2.68992814221344,"autoimmune"
"ulcerative_colitis",0.187763955873977,0.127638469530766,-13.9279062300709,2.52742536677685,2.98729538325691,2.42524539010519,1.11285397153932,0.594831983850718,-0.0732590594795718,4.09316926199843,0.887628082728305,3.77780261525578,"autoimmune"
"rheumatoid_arthritis",0.158691039640792,-0.0296551102513595,-15.3251476656711,1.55895110481963,15.8450458804273,0.547902010147141,0.337374925927234,1.52148872631903,1.5879471977881

## DEG_RNA-seq.R
# Source: Additional file 1 from Łabaj, Paweł P., and David P. Kreil. “Sensitivity, Specificity, and Reproducibility of RNA-Seq Differential Expression Calls.” Biology Direct 11, no. 1 (December 20, 2016): 66. doi:10.1186/s13062-016-0169-7.
# https://static-content.springer.com/esm/art%3A10.1186%2Fs13062-016-0169-7/MediaObjects/13062_2016_169_MOESM1_ESM.pdf

DEfun <- function(counts, design) {
    DE <- list()
    ## limma
    gene.dge <- DGEList(counts = counts, group = factor(rep(1:2, each = 4)))
    gene.dge.norm <- calcNormFactors(gene.dge)
    gene.dge.norm2 <- gene.dge.norm
    gene.dge.norm2$counts <- gene.dge.norm2$counts - 0.5
	# Download a list of problematic regions (aka blacklist) for the GRCm39/mm39
	# mouse genome assembly. Defined by the Boyle-Lab/Blacklist
	# software, High Signal and Low Mappability regions.
	# See https://github.com/dozmorovlab/excluderanges for more information.

	suppressMessages(library(httr)) # https://CRAN.R-project.org/package=httr
	suppressMessages(library(GenomicRanges)) # https://bioconductor.org/packages/GenomicRanges/
	# bedbase_id
	bedbase_id <- "edc716833d4b5ee75c34a0692fc353d5"
	# Construct output file name
	# Download a list of problematic regions (aka blacklist) for the T2T-CHM13
	# telomere-to-telomere human genome assembly. Defined by the Boyle-Lab/Blacklist
	# software, High Signal and Low Mappability regions.
	# See https://github.com/dozmorovlab/excluderanges for more information.

	suppressMessages(library(httr)) # https://CRAN.R-project.org/package=httr
	suppressMessages(library(GenomicRanges)) # https://bioconductor.org/packages/GenomicRanges/
	# bedbase_id
	bedbase_id <- "6548a002754cc1e882035293541b59a8"
	# Construct output file name
	# How to liftOver Paired BED data
	library(dplyr)
	library(tidyr)
	library(readxl)
	library(rtracklayer)

	# Landscape of Cohesin-Mediated Chromatin Loops in the Human Genome
	# https://www.nature.com/articles/s41586-020-2151-x
	# Supplementary Table 4 \| Pan-cell type cohesin-mediated chromatin loops, hg19 coordinates, paired BED data
	url1 <- "https://static-content.springer.com/esm/art%3A10.1038%2Fs41586-020-2151-x/MediaObjects/41586_2020_2151_MOESM5_ESM.xlsx"
	# Tesseract Intro: https://cran.r-project.org/web/packages/tesseract/vignettes/intro.html
	# Image source: https://twitter.com/davidsuter_epfl/status/1141959559654846464?s=20

	library(tesseract)
	eng <- tesseract("eng")
	text <- tesseract::ocr("https://pbs.twimg.com/media/D9kOESnW4AEs90l?format=jpg&name=900x900", engine = eng)
	cat(text)
	# Exploratory data analysis of SCSig collection: Signatures of Single Cell Identities
	# http://www.gsea-msigdb.org/gsea/msigdb/supplementary_genesets.jsp
	library(dplyr)
	library(clusterProfiler)
	# Read in gene sets
	mtx <- read.gmt("http://software.broadinstitute.org/gsea/msigdb/supplemental/scsig.all.v1.0.symbols.gmt")
	# Number of unique gene signatures
	mtx$ont %>% unique() %>% length()
	# Summary statistics on size of gene signatures
	mtx %>% group_by(ont) %>% summarise(size = n()) %>% select(size) %>% summary()
	# https://www.biostars.org/p/63225/

	FILE=VLI9_AA_S60_L008_R1_001.fastq.gz

	zcat $FILE \| head -n 40 \| awk '{if(NR%4==0) printf("%s",$0);}' \| od -A n -t u1 \| awk 'BEGIN{min=100;max=0;}{for(i=1;i<=NF;i++) {if($i>max) max=$i; if($i<min) min=$i;}}END{if(max<=74 && min<59) print "Phred+33"; else if(max>73 && min>=64) print "Phred+64"; else if(min>=59 && min<64 && max>73) print "Solexa+64"; else print "Unknown score encoding";}'
	# PCA: Check for batch effects. Select one batch, to color points by its assignment
	pca <- mtx %>% varFilter(., var.cutoff = 0.75) %>% scale %>% t %>% prcomp

	colorby <- "Group" # covariates[2]
	pt <- ggplot(data = data.frame(pca$x, annot),
	aes(x = as.numeric(PC1), y = as.numeric(PC2), label = Sample)) +
	theme(plot.title = element_text(lineheight = 0.8, face="bold")) +
	ggtitle(paste("PCA with batch, coloring by ", colorby)) +
	geom_point(aes(color = eval(parse(text = colorby))), size = 3) +
	geom_text_repel(colour = "black", size = 3) +
rank	name	country	category	sales	profits	assets	marketvalue
1	Citigroup	United States	Banking	94.71	17.85	1264.03	255.3
2	General Electric	United States	Conglomerates	134.19	15.59	626.93	328.54
3	American Intl Group	United States	Insurance	76.66	6.46	647.66	194.87
4	ExxonMobil	United States	Oil & gas operations	222.88	20.96	166.99	277.02
5	BP	United Kingdom	Oil & gas operations	232.57	10.27	177.57	173.54
6	Bank of America	United States	Banking	49.01	10.81	736.45	117.55
7	HSBC Group	United Kingdom	Banking	44.33	6.66	757.6	177.96
8	Toyota Motor	Japan	Consumer durables	135.82	7.99	171.71	115.4
9	Fannie Mae	United States	Diversified financials	53.13	6.48	1019.17	76.84
	"","E116.11BivFlnk.chromStates15","E116.14ReprPCWk.chromStates15","E116.15Quies.chromStates15","E116.1TssA.chromStates15","E116.2TssAFlnk.chromStates15","E116.3TxFlnk.chromStates15","E116.4Tx.chromStates15","E116.5TxWk.chromStates15","E116.6EnhG.chromStates15","E116.7Enh.chromStates15","E116.8ZNFRpts.chromStates15","E116.9Het.chromStates15","category"
	"crohns_disease",0.162052705452119,0.0635253490922535,-19.2252181061205,1.06711427791428,9.90758142461494,0.00255822057746797,2.56100484438806,-0.253587329445552,3.38505679108121,11.2870094369073,2.23623723813231,2.68992814221344,"autoimmune"
	"ulcerative_colitis",0.187763955873977,0.127638469530766,-13.9279062300709,2.52742536677685,2.98729538325691,2.42524539010519,1.11285397153932,0.594831983850718,-0.0732590594795718,4.09316926199843,0.887628082728305,3.77780261525578,"autoimmune"
	"rheumatoid_arthritis",0.158691039640792,-0.0296551102513595,-15.3251476656711,1.55895110481963,15.8450458804273,0.547902010147141,0.337374925927234,1.52148872631903,1.5879471977881
	# Source: Additional file 1 from Łabaj, Paweł P., and David P. Kreil. “Sensitivity, Specificity, and Reproducibility of RNA-Seq Differential Expression Calls.” Biology Direct 11, no. 1 (December 20, 2016): 66. doi:10.1186/s13062-016-0169-7.
	# https://static-content.springer.com/esm/art%3A10.1186%2Fs13062-016-0169-7/MediaObjects/13062_2016_169_MOESM1_ESM.pdf

	DEfun <- function(counts, design) {
	DE <- list()
	## limma
	gene.dge <- DGEList(counts = counts, group = factor(rep(1:2, each = 4)))
	gene.dge.norm <- calcNormFactors(gene.dge)
	gene.dge.norm2 <- gene.dge.norm
	gene.dge.norm2$counts <- gene.dge.norm2$counts - 0.5