Skip to content

Instantly share code, notes, and snippets.

View mdozmorov's full-sized avatar

Mikhail Dozmorov mdozmorov

View GitHub Profile
@mdozmorov
mdozmorov / gist_mm39_excluderanges.R
Created September 19, 2022 23:39
mm39 excluderanges download
# Download a list of problematic regions (aka blacklist) for the GRCm39/mm39
# mouse genome assembly. Defined by the Boyle-Lab/Blacklist
# software, High Signal and Low Mappability regions.
# See https://github.com/dozmorovlab/excluderanges for more information.
suppressMessages(library(httr)) # https://CRAN.R-project.org/package=httr
suppressMessages(library(GenomicRanges)) # https://bioconductor.org/packages/GenomicRanges/
# bedbase_id
bedbase_id <- "edc716833d4b5ee75c34a0692fc353d5"
# Construct output file name
@mdozmorov
mdozmorov / gist_T2T_excluderanges.R
Last active September 19, 2022 19:59
T2T excluderanges download
# Download a list of problematic regions (aka blacklist) for the T2T-CHM13
# telomere-to-telomere human genome assembly. Defined by the Boyle-Lab/Blacklist
# software, High Signal and Low Mappability regions.
# See https://github.com/dozmorovlab/excluderanges for more information.
suppressMessages(library(httr)) # https://CRAN.R-project.org/package=httr
suppressMessages(library(GenomicRanges)) # https://bioconductor.org/packages/GenomicRanges/
# bedbase_id
bedbase_id <- "6548a002754cc1e882035293541b59a8"
# Construct output file name
@mdozmorov
mdozmorov / liftOver.R
Created November 27, 2020 01:33
How to liftOver Paired BED data
# How to liftOver Paired BED data
library(dplyr)
library(tidyr)
library(readxl)
library(rtracklayer)
# Landscape of Cohesin-Mediated Chromatin Loops in the Human Genome
# https://www.nature.com/articles/s41586-020-2151-x
# Supplementary Table 4 | Pan-cell type cohesin-mediated chromatin loops, hg19 coordinates, paired BED data
url1 <- "https://static-content.springer.com/esm/art%3A10.1038%2Fs41586-020-2151-x/MediaObjects/41586_2020_2151_MOESM5_ESM.xlsx"
# Tesseract Intro: https://cran.r-project.org/web/packages/tesseract/vignettes/intro.html
# Image source: https://twitter.com/davidsuter_epfl/status/1141959559654846464?s=20
library(tesseract)
eng <- tesseract("eng")
text <- tesseract::ocr("https://pbs.twimg.com/media/D9kOESnW4AEs90l?format=jpg&name=900x900", engine = eng)
cat(text)
# Exploratory data analysis of SCSig collection: Signatures of Single Cell Identities
# http://www.gsea-msigdb.org/gsea/msigdb/supplementary_genesets.jsp
library(dplyr)
library(clusterProfiler)
# Read in gene sets
mtx <- read.gmt("http://software.broadinstitute.org/gsea/msigdb/supplemental/scsig.all.v1.0.symbols.gmt")
# Number of unique gene signatures
mtx$ont %>% unique() %>% length()
# Summary statistics on size of gene signatures
mtx %>% group_by(ont) %>% summarise(size = n()) %>% select(size) %>% summary()
@mdozmorov
mdozmorov / check_phred.sh
Created May 18, 2019 01:55
Check Phred offset
# https://www.biostars.org/p/63225/
FILE=VLI9_AA_S60_L008_R1_001.fastq.gz
zcat $FILE | head -n 40 | awk '{if(NR%4==0) printf("%s",$0);}' | od -A n -t u1 | awk 'BEGIN{min=100;max=0;}{for(i=1;i<=NF;i++) {if($i>max) max=$i; if($i<min) min=$i;}}END{if(max<=74 && min<59) print "Phred+33"; else if(max>73 && min>=64) print "Phred+64"; else if(min>=59 && min<64 && max>73) print "Solexa+64"; else print "Unknown score encoding";}'
# PCA: Check for batch effects. Select one batch, to color points by its assignment
pca <- mtx %>% varFilter(., var.cutoff = 0.75) %>% scale %>% t %>% prcomp
colorby <- "Group" # covariates[2]
pt <- ggplot(data = data.frame(pca$x, annot),
aes(x = as.numeric(PC1), y = as.numeric(PC2), label = Sample)) +
theme(plot.title = element_text(lineheight = 0.8, face="bold")) +
ggtitle(paste("PCA with batch, coloring by ", colorby)) +
geom_point(aes(color = eval(parse(text = colorby))), size = 3) +
geom_text_repel(colour = "black", size = 3) +
rank name country category sales profits assets marketvalue
1 Citigroup United States Banking 94.71 17.85 1264.03 255.3
2 General Electric United States Conglomerates 134.19 15.59 626.93 328.54
3 American Intl Group United States Insurance 76.66 6.46 647.66 194.87
4 ExxonMobil United States Oil & gas operations 222.88 20.96 166.99 277.02
5 BP United Kingdom Oil & gas operations 232.57 10.27 177.57 173.54
6 Bank of America United States Banking 49.01 10.81 736.45 117.55
7 HSBC Group United Kingdom Banking 44.33 6.66 757.6 177.96
8 Toyota Motor Japan Consumer durables 135.82 7.99 171.71 115.4
9 Fannie Mae United States Diversified financials 53.13 6.48 1019.17 76.84
"","E116.11BivFlnk.chromStates15","E116.14ReprPCWk.chromStates15","E116.15Quies.chromStates15","E116.1TssA.chromStates15","E116.2TssAFlnk.chromStates15","E116.3TxFlnk.chromStates15","E116.4Tx.chromStates15","E116.5TxWk.chromStates15","E116.6EnhG.chromStates15","E116.7Enh.chromStates15","E116.8ZNFRpts.chromStates15","E116.9Het.chromStates15","category"
"crohns_disease",0.162052705452119,0.0635253490922535,-19.2252181061205,1.06711427791428,9.90758142461494,0.00255822057746797,2.56100484438806,-0.253587329445552,3.38505679108121,11.2870094369073,2.23623723813231,2.68992814221344,"autoimmune"
"ulcerative_colitis",0.187763955873977,0.127638469530766,-13.9279062300709,2.52742536677685,2.98729538325691,2.42524539010519,1.11285397153932,0.594831983850718,-0.0732590594795718,4.09316926199843,0.887628082728305,3.77780261525578,"autoimmune"
"rheumatoid_arthritis",0.158691039640792,-0.0296551102513595,-15.3251476656711,1.55895110481963,15.8450458804273,0.547902010147141,0.337374925927234,1.52148872631903,1.5879471977881
@mdozmorov
mdozmorov / DEG_RNA-seq.R
Created February 2, 2017 02:01
Differential expression analysis in RNA-seq
# Source: Additional file 1 from Łabaj, Paweł P., and David P. Kreil. “Sensitivity, Specificity, and Reproducibility of RNA-Seq Differential Expression Calls.” Biology Direct 11, no. 1 (December 20, 2016): 66. doi:10.1186/s13062-016-0169-7.
# https://static-content.springer.com/esm/art%3A10.1186%2Fs13062-016-0169-7/MediaObjects/13062_2016_169_MOESM1_ESM.pdf
DEfun <- function(counts, design) {
DE <- list()
## limma
gene.dge <- DGEList(counts = counts, group = factor(rep(1:2, each = 4)))
gene.dge.norm <- calcNormFactors(gene.dge)
gene.dge.norm2 <- gene.dge.norm
gene.dge.norm2$counts <- gene.dge.norm2$counts - 0.5