Single-cell Search Engine for Tabula Muris
# Install the keras R package from GitHub and set up its Python backend
devtools::install_github("rstudio/keras")
library(keras)
install_keras()
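# (Optional sanity check, a minimal sketch not in the original gist: confirm the
# Keras Python backend is reachable before building the embedding model below.)
if (!keras::is_keras_available()) stop("Keras backend not available; re-run install_keras()")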
# Install the remaining packages used below (note: the CRAN package is "KernelKnn")
list.of.packages = c("class", "ggsci", "ggpubr", "tidyverse", "KernelKnn", "doParallel")
new.packages = list.of.packages[!(list.of.packages %in% installed.packages()[, "Package"])]
if(length(new.packages)) install.packages(new.packages)
library(tidyverse)  # provides the %>% pipe and the dplyr verbs used throughout
# Download and unpack the mapcell repository from GitHub
mapcell_zip_url = "https://github.com/lianchye/mapcell/archive/master.zip"
download.file(url = mapcell_zip_url, destfile = "mapcell-master.zip")
unzip(zipfile = "mapcell-master.zip")
setwd(dir = "./mapcell-master/")
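# (Optional check, a sketch not in the original gist: the paths below assume the
# zip unpacked into ./mapcell-master with these files in place.)
stopifnot(file.exists("./annotation_functions/function_lib_annotation.R"),
          dir.exists("./models_reference/human_cell_landscape/peri-blood"))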
# Load the trained embedding model and the reference cells to search against
source("./annotation_functions/function_lib_annotation.R")
gene_names = "./models_reference/human_cell_landscape/peri-blood/gene_names.rds" %>% readRDS()
model_sel = "./models_reference/human_cell_landscape/peri-blood/weights.h5" %>% build_embedding(rna_length = length(gene_names),weights_h5 = .)
refCells_sel = "./models_reference/human_cell_landscape/peri-blood/embed_ref.rds" %>% readRDS()
metaCells_sel = "./models_reference/human_cell_landscape/peri-blood/meta_ref.rds" %>% readRDS()
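# (Optional inspection, a sketch assuming refCells_sel is a matrix of embedded
# reference cells and metaCells_sel a data frame with a Celltype_clean column,
# as the annotation loop below expects.)
length(gene_names)                 # number of genes the model takes as input
dim(refCells_sel)                  # reference cells x embedding dimensions
head(table(metaCells_sel$Celltype_clean))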
# Install the Tabula Muris data package and its Bioconductor dependencies
if (!requireNamespace("BiocManager", quietly = TRUE)) install.packages("BiocManager")
BiocManager::install("SingleCellExperiment")
BiocManager::install("ExperimentHub")
BiocManager::install("TabulaMurisData")
suppressPackageStartupMessages({
  library(ExperimentHub)
  library(SingleCellExperiment)
  library(TabulaMurisData)
})
# Fetch the Tabula Muris droplet data (record EH1617); it is cached locally,
# so the first download may take some time
eh = ExperimentHub()
query(eh, "TabulaMurisData")
droplet = eh[["EH1617"]]
# Upper-case the mouse gene symbols so they can be matched against the model's gene names
rownames(droplet) = rownames(droplet) %>% toupper()
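# (Optional inspection, not part of the original gist: droplet is a
# SingleCellExperiment, so dim() gives genes x cells and colData() the per-cell
# metadata, including the cell_ontology_class used for the comparison at the end.)
dim(droplet)
head(colData(droplet))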
# Annotate cells in chunks to keep memory use manageable
cell_batch_size = 2500
chunks = split(1:ncol(droplet), ceiling(seq_along(1:ncol(droplet)) / cell_batch_size))
# Genes shared between the model's gene list and the Tabula Muris data
common_genes = intersect(gene_names %>% toupper(), rownames(droplet))
neighbors = 20
out.list = list()
for(chunk_ind in 1:length(chunks)){
  cat(chunk_ind)
  # Build a zero-padded count matrix over the model's full gene list for this chunk of cells
  ext_data.ct = matrix(0, nrow = length(gene_names), ncol = length(chunks[[chunk_ind]]))
  rownames(ext_data.ct) = gene_names
  colnames(ext_data.ct) = colnames(droplet)[chunks[[chunk_ind]]]
  ext_data.ct[common_genes, ] = assay(droplet[common_genes, chunks[[chunk_ind]]], "counts") %>% as.matrix()
  # Scale each cell to its maximum count and project it into the embedding space
  ext_data.embed = model_sel %>% predict(ext_data.ct %>% apply(., 2, function(x){x/max(x)}) %>% as.matrix() %>% t(), verbose = 1)
  # Find the k nearest reference cells for each embedded query cell
  knn_index_out = KernelKnn::knn.index.dist(refCells_sel, ext_data.embed, k = neighbors, method = "pearson_correlation", threads = 14)
  # One row per (cell, neighbour): the neighbour's cell-type label and its distance
  snn_annot.df =
    data.frame(cell_bc = rep(colnames(ext_data.ct), each = neighbors),
               knn_annot = knn_index_out$test_knn_idx %>% t() %>% as.vector() %>% metaCells_sel$Celltype_clean[.],
               knn_dist = knn_index_out$test_knn_dist %>% t() %>% as.vector(),
               stringsAsFactors = FALSE)
  # Majority vote among the k neighbours gives each cell's annotation call
  snn_annot.df =
    snn_annot.df %>%
    left_join(
      snn_annot.df %>%
        group_by(cell_bc, knn_annot) %>% tally() %>%
        group_by(cell_bc) %>% slice(which.max(n)) %>% ungroup() %>%
        rename(annot_call = knn_annot, annot_cts = n))
  out.list[[chunk_ind]] = snn_annot.df
}
# Combine all chunks, attach the Tabula Muris cell metadata, drop cells without an
# ontology class, and keep each cell's single nearest reference neighbour
out.df = out.list %>% do.call("rbind", .) %>%
  left_join(colData(droplet) %>% as.data.frame(), by = c("cell_bc" = "cell")) %>%
  filter(!is.na(cell_ontology_class), cell_ontology_class != "NA") %>%
  group_by(cell_bc) %>% top_n(n = 1, wt = -knn_dist)
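# (Follow-up sketch, not part of the original gist: cross-tabulate the mapcell
# annotation call against the published Tabula Muris cell_ontology_class to see
# how the peri-blood reference labels map onto the droplet cells.)
annot_vs_ontology = out.df %>%
  ungroup() %>%
  distinct(cell_bc, annot_call, cell_ontology_class) %>%
  count(annot_call, cell_ontology_class) %>%
  arrange(desc(n))
head(annot_vs_ontology, 20)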