klprint/sce_helper.R

## sce_helper.R
library(hdf5r)
library(Matrix)
source("https://gist.githubusercontent.com/klprint/1ab4468eb3c54abcf0422dec6223b8fc/raw/b4cc33f5b4da25bcc2e678cf46b692fe67605460/single_cell_functions.R")
library(SingleCellExperiment)
library(tidyverse)
library(liger)

sce.raw.pca <- function(sce, k = 100){
  umi <- assay(sce, "umi")

  cat("Getting informative genes\n")
  info.genes <- get.info.genes(umi)

  cat("\tSetting info genes\n")
  umi <- umi[info.genes, ]

  cat("Normalization\n")
  nrm <- t(umi) / colSums(umi)

  cat("Running PCA\n")
  pca <- irlba::prcomp_irlba(nrm, n = k)$x
  rownames(pca) <- colnames(umi)

  reducedDim(sce, "raw_pca") <- pca

  return(sce)
}

sce.split.by.col <- function(sco, column, min.cells = 1, n.cores = 10){
  lvls <- unique(colData(sco)[,column])

  sco <- sco[,!is.na(colData(sco)[,column])]

  out <- pbmcapply::pbmclapply(lvls, function(x){
    sel <- colData(sco)[,column] == x

    if(sum(sel, na.rm = T) <= min.cells) return(NA)

    sco[,sel]
  }, mc.cores = n.cores)

  names(out) <- lvls
  out <- out[!is.na(out)]
  return(out)
}

sce.run.liger <- function(sce.list, k = 25, assay.slot = "umi", plot.genes=F){
  require(liger)
  umi <- lapply(sce.list, function(x) assay(x, assay.slot))

  cat("Creating LIGER object\n")
  l = createLiger(umi)

  cat("Preprocessing LIGER\n")
  l = liger::normalize(l)
  l = liger::selectGenes(l, do.plot = plot.genes)
  l = liger::scaleNotCenter(l)


  cat("Running LIGER\n")
  l = optimizeALS(l, k = k)

  return(l)
}

run.umap <- function(X, n_components = 3, min_dist = .1, n_neighbors = 15, ret_nn = F){
  set.seed(1234)
  uwot::umap(
    X,
    metric = "cosine",
    min_dist = min_dist,
    n_neighbors = n_neighbors,
    verbose = T,
    n_components = n_components,
    ret_nn = ret_nn
  )
}

sce.run.umap <- function(sce, use.reduction = "liger", n_components = 3, min_dist = .1, n_neighbors = 15){
  out <- run.umap(reducedDim(sce, use.reduction), n_components = n_components, min_dist = min_dist, n_neighbors = n_neighbors)
  rownames(out) <- rownames(reducedDim(sce, use.reduction))

  reducedDim(sce, paste0("umap", n_components, "d")) <- out
  return(sce)
}

liger.to.sce <- function(liger.object, input.sce.list){
  out <- SingleCellExperiment(
    assays = list(umi = do.call(merge.sparse, liger.object@raw.data)),
    colData = lapply(input.sce.list, colData) %>% do.call(rbind,.),
    reducedDims = list(
      liger = do.call(rbind, liger.object@H)
    )
  )

  return(out)
}

sce.pointcloud.cat <- function(sce, reduceDimName, categoryName){
  emb <- reducedDim(sce, reduceDimName)
  ctg <- colData(sce)[,categoryName]
  ctg[is.na(ctg)] <- "ND"

  emb <- emb[order(ctg), ]
  ctg <- sort(ctg)
  babyplots::pointCloud(emb, "categories", ctg)
}

sce.pointcloud.gene <- function(sce, reduceDimName, gene, assay="umi"){
  emb <- reducedDim(sce, reduceDimName)

  val <- log1p((assay(ct.comb["ENSG00000075891", ], assay) / colSums( assay(ct.comb, assay)))[1,])


  babyplots::pointCloud(emb, "values", val)
}

sce.pointcloud.val <- function(sce, reduceDimName, val){
  emb <- reducedDim(sce, reduceDimName)
  babyplots::pointCloud(emb, "values", val)
}

sce.pseudobulk <- function(sce, group.by, min_cells = 50, assay = "umi", mc.cores=10, subsample.to = NULL){
  splitted <- sce.split.by.col(sce, group.by, min.cells = min_cells)
  #print(splitted)
  if(length(splitted) == 0){
      return(NA)
  }
  pb <- pbmcapply::pbmclapply(splitted, function(x) {
    if(!is.null(subsample.to)){
      if(ncol(x) > subsample.to){
        set.seed(1234)
        x <- x[,sample(colnames(x), subsample.to, replace = F)]
      }
    }
    rowSums(assay(x, assay))
  }, mc.cores = mc.cores) %>%
    do.call(cbind, .)

  colnames(pb) <- names(splitted)
  return(pb)
}

sce.pseudobulk.center <- function(sce, group.by, min_cells = 50, assay = "umi", mc.cores=10, subsample.to = NULL){
  splitted <- sce.split.by.col(sce, group.by, min.cells = min_cells)
  #print(splitted)
  if(length(splitted) == 0){
    return(NA)
  }
  pb <- pbmcapply::pbmclapply(splitted, function(x) {
    if(!is.null(subsample.to)){
      if(ncol(x) > subsample.to){
        set.seed(1234)
        x <- x[,sample(colnames(x), subsample.to, replace = F)]
      }
    }
    rowMeans(assay(x, assay))
  }, mc.cores = mc.cores) %>%
    do.call(cbind, .)

  colnames(pb) <- names(splitted)
  return(pb)
}

sce.to.anndata <- function(sce, neighbors_from = "liger", n_neighbors = 30, n_pcs = ncol(reducedDim(sce, neighbors_from)), assay = "umi"){
  cat("Creating AnnData Object\n")
  sc <- import("scanpy")

  ad <- sc$AnnData(X = t(assay(sce, assay)),
                   obs = as.data.frame(colData(sce)),
                   obsm = reducedDims(sce)@listData[!startsWith(reducedDimNames(sce), "X_")])

  print(ad)


  sc$pp$neighbors(ad, n_neighbors = as.integer(n_neighbors), use_rep = neighbors_from, n_pcs = as.integer(n_pcs))


  return(ad)
}

sce.diffmap <- function(sce, n_diff_comps = 5, neighbors_from = "liger", n_neighbors = 30, n_pcs = ncol(reducedDim(sce, neighbors_from)), assay = "umi"){
  sc <- import("scanpy")
  ad <- sce.to.anndata(sce,
                       n_neighbors = as.integer(n_neighbors),
                       neighbors_from = neighbors_from,
                       n_pcs = as.integer(n_pcs),
                       assay = assay)


  cat("Running diffusion map\n")
  sc$tl$diffmap(ad, n_comps = as.integer(n_diff_comps))

  sce@metadata$ad <- ad$copy()

  reducedDim(sce, "X_diffmap") <- ad$obsm["X_diffmap"]
  return(sce)
}

sce.dpt <- function(sce, root_cell_index, n_dcs = 5){
  stopifnot("X_diffmap" %in% reducedDimNames(sce))

  sc <- import("scanpy")
  ad <- sce@metadata$ad

  ad$uns["iroot"] <- as.integer(root_cell_index-1)

  cat("Running DPT\n")
  sc$tl$dpt(ad, n_dcs = as.integer(n_dcs))

  sce$dpt <- ad$obs["dpt_pseudotime"][,1]
  sce$dpt_rank <- rank(sce$dpt, ties.method = "first")
  sce$dpt_rank <- sce$dpt_rank - min(sce$dpt_rank)
  sce$dpt_rank <- sce$dpt_rank / max(sce$dpt_rank)

  return(sce)
}

sce.plot.3d <- function(sce, val, redDim = "umap3d", width = NULL, height=NULL){
  coord <- reducedDim(sce, redDim)[,1:3]

  if(val %in% colnames(colData(sce))){
      value = colData(sce)[,val]
  }else if(val %in% rownames(sce)){
      value = log1p(assay(sce, "umi")[val, ] / colSums(assay(sce, "umi")))
  }else{
      stop(paste0("Could not find \"", val, "\" in the object!"))
  }

  val = value

  if(is.numeric(val)){
    val[is.na(val)] <- 0
    babyplots::pointCloud(
      coord,
      "values",
      val,
      width = width,
      height = height
    )
  }else if(is.factor(val) | is.character(val)){
    val[is.na(val)] <- "NA"
    coord <- coord[order(val), ]
    val <- sort(val)
    babyplots::pointCloud(
      coord,
      "categories",
      val,
      width = width,
      height = height
    )
  }else{
    stop("val is neither numeric, nor a character or factor!")
  }
}

sce.plot.2d <- function(sce,
                        val,
                        redDim = "umap2d",
                        comps = 1:2,
                        point_size=.1,
                        alpha = 1,
                        with_repel = F,
                        repel_size=3,
                        do.sort=F,
                        force.discrete = F,
                        color.palette.vector = c("gray", rev(viridis::inferno(100))),
                        color.cubehelix = F,
                        color.inferno = F){
  coord <- reducedDim(sce, redDim)[,comps]

  if(color.cubehelix){
      color.palette.vector <- rev(rje::cubeHelix(100))
  }

  if(color.inferno){
      color.palette.vector <- viridis::inferno(100)
  }

  val.orig <- val

  has.rowdata <- ifelse(!is.null(ncol(rowData(sce))),
                        ifelse("gene_name" %in% colnames(rowData(sce)),
                               ifelse(val %in% rowData(sce)[,"gene_name"],
                                      T,
                                      F),
                               F),
                        F)

  if(length(val) == ncol(sce)){
      value = val
  }else if(val %in% colnames(colData(sce))){
      value = colData(sce)[,val]
  }else if(val %in% rownames(sce)){
      value = sqrt(assay(sce, "umi")[val, ] / colSums(assay(sce, "umi")))
  }else if( has.rowdata ){
      val <- rownames(rowData(sce))[which(rowData(sce)[,"gene_name"] == val)]
      print(val)
      value = sqrt(assay(sce, "umi")[val, ] / colSums(assay(sce, "umi")))
  }else{
      stop(paste0("Could not find \"", val, "\" in the object!"))
  }

  if(force.discrete) value <- factor(as.character(value), levels = as.character(sort(unique(value))))

  dat <-  tibble(
    cell_id = rownames(coord),
    x = coord[,1],
    y = coord[,2]
  ) %>%
    left_join(
      tibble(
        cell_id = rownames(colData(sce)),
        val = value
      ),
        by="cell_id"
    ) %>% ungroup() %>% sample_n(n())


  if(is.numeric(value)){
    if(do.sort){
      dat <- dat %>% arrange(val)
    }
    p <- dat %>%
      ggplot(aes(x,y,color = val)) +
      geom_point(size=point_size, alpha = alpha)

    if(!is.null(color.palette.vector)){
        p <- p +
            scale_color_gradientn(name=val.orig, colors = color.palette.vector)
    }else{
        p <- p +
          scale_color_continuous(name = val.orig, low="gray", high="red")
    }

  }else{
    p <- dat %>%
      ggplot(aes(x,y,color = val)) +
      geom_point(size=point_size)
    # p <- p +
    #   scale_color_discrete(name=val.orig) +
    #   guides(color = guide_legend(override.aes = aes(size = 2)))
    #
    if(!is.null(color.palette.vector)){
      p <- p + scale_color_manual(name=val.orig,
                                  values = color.palette.vector, na.value="#808080") +
        guides(color = guide_legend(override.aes = aes(size = 2)))
    }else{
      p <- p +
          scale_color_discrete(name=val.orig) +
          guides(color = guide_legend(override.aes = aes(size = 2)))
    }

  }

  if(with_repel){
    p <- p + ggrepel::geom_label_repel(
      data = function(x){
        x %>% group_by(val) %>%
          summarise(x = median(x),
                    y = median(y))
      },
      mapping = aes(label = val), size = repel_size
    ) +
      theme(legend.position = "none")
  }

  return(p)
}


sce.run.liger.full = function(sce, min.cells = 50, k=25, split.by = "batch", n.cores=10){

    if(!is.na(ncol(rowData(sce)))) row.data = rowData(sce)

    cat("Splitting\n")
    data.list = sce.split.by.col(sce, split.by, min.cells = min.cells, n.cores = n.cores)
    cat("Running liger\n")
    data.liger = sce.run.liger(data.list,k=k)
    cat("Combining to SCE object\n")
    out = liger.to.sce(data.liger, data.list)


    tmp.rd = matrix(NA, ncol=k, nrow=nrow(out), dimnames = list(rownames(out), paste0("liger_", 1:k)))
    tmp.rd[colnames(data.liger@W), ] = t(data.liger@W)
    tmp.rd = as.data.frame(tmp.rd)

    if(!is.na(ncol(rowData(sce)))){
      rowData(out) <- tmp.rd %>%
        rownames_to_column("gene_id") %>%
        left_join(
          as.data.frame(row.data) %>%
            rownames_to_column("gene_id")
        ) %>%
        as.data.frame() %>%
        column_to_rownames("gene_id")
    }else{
      rowData(out) = tmp.rd
    }


    cat("UMAP")
    out = sce.run.umap(out, n_components = 2)


    return(out)
}

sce.liger.pheatmap = function(sce, order.col.by = "dpt", annot.cols = NULL){
    annot = NULL

    if(!is.null(annot.cols)){
        annot = as.data.frame(colData(sce)[,annot.cols])
        rownames(annot) <- colnames(sce)
        colnames(annot) <- annot.cols
    }
    print(head(annot))

    ph.data = t(reducedDim(sce, "liger")[order(colData(sce)[,order.col.by]), ])
    rownames(ph.data) = paste0("liger_", 1:nrow(ph.data))

    print(head(ph.data)[,1:5])

    pheatmap::pheatmap(
        ph.data,
        show_rownames=T,
        show_colnames=F,
        cluster_cols=F,
        clustering_distance_rows="correlation",
        color=colorRampPalette(c("white", "black"))(100),
        annotation_col = annot
    )
}

sce.subsample.per.group <- function(sce, group.by, n.cells=1000, min.cells = 25, allow.min = F,
                                    mc.ncores = 10){

  sce.list <- sce.split.by.col(sce, column = group.by, min.cells = min.cells, n.cores = mc.ncores)

  if(!allow.min) sce.list <- sce.list[lapply(sce.list, ncol) %>% do.call("c", .) >= n.cells]

  sce.list <- lapply(sce.list, function(x){
    if(ncol(x) > n.cells){
      set.seed(1234)
      x <- x[,sample(colnames(x), n.cells, replace = F)]
    }

    return(x)
  })

  out <- do.call(cbind, sce.list)
  return(out)
}

sce.gene.enrichment <- function(sce,
                                in.group,
                                label,
                                balance.group = NULL,
                                do.balance = F,
                                subsample.to = 500,
                                allow.min.cells = F,
                                min.cells = 25,
                                mc.cores=10,
                                use.hvg = T,
                                frac.min=.25){


  ## Subsampling, if wanted
  if(do.balance){
    sce <- sce.subsample.per.group(sce,
                                   group.by = in.group,
                                   n.cells = subsample.to,
                                   min.cells = min.cells,
                                   allow.min = allow.min.cells,
                                   mc.ncores = mc.cores)
  }
  ##

  umi <- assay(sce, "umi")
  sf <- colSums(umi)
  if(use.hvg) umi <- umi[get.info.genes(umi), ]

  group.labels <- colData(sce)[,in.group]
  are.ingroup <- group.labels %in% label

  test.genes <- rowSums( umi[,are.ingroup, drop=F] > 0 ) > floor(frac.min * sum(are.ingroup))

  umi <- umi[test.genes, ]

  out <- pbmcapply::pbmclapply(rownames(umi), function(x){
    tibble(
      gene_id = x,
      log2fc = log2(mean( log1p(umi[x,are.ingroup] / sf[are.ingroup])) / mean(log1p(umi[x,!are.ingroup] / sf[!are.ingroup]))),
      p.value = wilcox.test(x = log1p(umi[x,are.ingroup] / sf[are.ingroup]), y =  log1p(umi[x,!are.ingroup] / sf[!are.ingroup]), alternative = "greater" )$p.value
    )
  }, mc.cores = mc.cores) %>%
    do.call(rbind, .)

  out$p.adjust = p.adjust(out$p.value, method = "BH")

  if(!is.null(ncol(rowData(sce)))){
    if("gene_name" %in% colnames(rowData(sce))){
      out <- out %>% left_join(
        as.data.frame(rowData(sce)) %>%
          rownames_to_column("gene_id") %>%
          select("gene_id", "gene_name")
      )
    }
  }

  return(out %>% arrange(desc(log2fc)))
}

sce.all.markers <- function(sce,
                            in.group,
                            balance.group = NULL,
                            do.balance = F,
                            subsample.to = 500,
                            allow.min.cells = F,
                            min.cells = 25,
                            mc.cores=10,
                            use.hvg = T,
                            frac.min=.25){

  lvls <- unique(colData(sce)[,in.group])

  out <- lapply(lvls, function(i){
    cat("Finding genes for ", i, "\n")
    sce.gene.enrichment(sce = sce,
                        in.group = in.group,
                        label = i,
                        balance.group = balance.group,
                        do.balance = do.balance,
                        subsample.to = subsample.to,
                        allow.min.cells = allow.min.cells,
                        min.cells = min.cells,
                        mc.cores=mc.cores,
                        use.hvg = use.hvg,
                        frac.min=frac.min)
  })
  names(out) <- lvls

  return(out)

}

sce.plot.along <- function(sce, val, along = "stage.ord", as.boxplot = F, as.violin = F,
                           pointsize = .1, color=NULL){
  val.orig <- val

  has.rowdata <- ifelse(!is.null(ncol(rowData(sce))),
                        ifelse("gene_name" %in% colnames(rowData(sce)),
                               ifelse(val %in% rowData(sce)[,"gene_name"],
                                      T,
                                      F),
                               F),
                        F)

  if(length(val) == ncol(sce)){
    value = val
  }else if(val %in% rownames(sce)){
    value = log1p(assay(sce, "umi")[val, ] / colSums(assay(sce, "umi")))
  }else if( has.rowdata ){
    val <- rownames(rowData(sce))[which(rowData(sce)[,"gene_name"] == val)]
    print(val)
    value = log1p(assay(sce, "umi")[val, ] / colSums(assay(sce, "umi")))
  }else{
    stop(paste0("Could not find \"", val, "\" in the object!"))
  }

  if(!is.null(color)){
    df <- tibble(
      cell_id = colnames(sce),
      exprs = value
    ) %>%
      left_join(
        as.data.frame(colData(sce)) %>%
          rownames_to_column("cell_id")
      ) %>%
      select(exprs, along, color) %>%
      dplyr::rename(x = along,
                    group = color)
  }else{
    df <- tibble(
      cell_id = colnames(sce),
      exprs = value
    ) %>%
      left_join(
        as.data.frame(colData(sce)) %>%
          rownames_to_column("cell_id")
      ) %>%
      select(exprs, along, color) %>%
      dplyr::rename(x = along)
  }


  plt <- df %>%
    ggplot(aes(x=x, y=exprs)) +
    xlab(along)

  if(as.boxplot){
    plt <- plt + geom_boxplot()
  }else if(as.violin){
    plt <- plt + geom_violin()
  }else if(!is.null(color)){
    plt <- plt + geom_point(aes(color=group),
                            size=pointsize)
  }else{
    plt <- plt + geom_point(size=pointsize)
  }

  if(!is.numeric(colData(sce)[,along])){
    plt <- plt+
      theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust=.5))
  }

  return(plt)

}

sce.get.gene.exprs <- function(sce, gene, umi=F){
  out <- NULL
  gene.orig <- gene

  if(!is.null(ncol(rowData(sce)))){
    if("gene_name" %in% colnames(rowData(sce))){
      if(gene %in% rowData(sce)[,"gene_name"]){
        gene <- rownames(rowData(sce))[which(rowData(sce)[,"gene_name"] == gene)]
      }
    }
  }

  if(gene %in% rownames(sce)){
    if(!umi){
      out <- sqrt(assay(sce, "umi")[gene,] / colSums(assay(sce, "umi")))
      #out <- scale(sqrt(assay(sce, "umi")[gene,] / colSums(assay(sce, "umi"))))[,1]
    }else{
      out <- assay(sce, "umi")[gene,]
    }

  }

  return(out)
}

sce.get.exprs.df <- function(sce, genes){
  lapply(genes, function(g){

    exprs <- sce.get.gene.exprs(sce, g, umi = T)

    if(is.null(exprs)){
      return(NULL)
    }else{
      sce.colData(sce) %>%
        as_tibble() %>%
        left_join(
          tibble(
            cell_id = colnames(sce),
            size_factor = colSums(assay(sce, "umi"))
          )
        ) %>%
        left_join(
          tibble(
            cell_id = names(exprs),
            umi = exprs,
            gene = g
          )
        )
    }

  }) %>% do.call(rbind, .)
}

# Plotting expression polished:
sce.plot.polished.exprs <- function(sce,
                                    gn,
                                    reducedDim_name = "umap2d",
                                    color = "cell_type",
                                    dims = 1:2,
                                    dim_names = "UMAP",
                                    color_vec = c("gray", rev(viridis::inferno(100)))){

    ex = sce.get.exprs.df(sce, gn) %>%
        mutate(exprs = log1p(umi / size_factor * 1e6))

    rd <- reducedDim(sce, reducedDim_name)[,dims] %>%
        as.data.frame() %>%
        rownames_to_column("cell_id") %>%
        as_tibble()

    colnames(rd) <- c("cell_id", "d1", "d2")

    colnames(ex)[colnames(ex) == color] <- "group"

    ex %>%
        left_join(rd, by="cell_id") %>%
        ungroup() %>%
        sample_n(n()) %>%
        filter(group != "NA") %>%
        ggplot(aes(d1, d2)) +
        geom_point(aes(color=exprs),size=.01, alpha=.5) +
        #scale_color_viridis_c("expression", option="A") +
        scale_color_gradientn(colors = color_vec) +
        #new_scale_color() +
        ggrepel::geom_label_repel(
            data = function(x){
                x %>%
                    group_by(group) %>%
                    mutate(prop = sum(umi>0) / n()) %>%
                    filter(umi > 0) %>%
                    #filter(n() >= 10) %>%
                    summarise(d1 = median(d1),
                              d2 = median(d2),
                              prop = unique(prop)) %>%
                    ungroup() %>%
                    top_n(3, prop) %>%
                    filter(prop >= .01) %>%
                    mutate(prop = round(prop, 2))
            },
            mapping=aes(size = prop,
                        label = group),
            #size=3,
            force = 5,
            nudge_x = 2,
            nudge_y = 2
        ) +
        labs(x = paste0(dim_names, dims[1]),
             y = paste0(dim_names, dims[2])) +
        scale_size(range = c(4, 7)) +
        guides(size = guide_legend("proportion")) +
        ggtitle(gn) +
        #theme_dark()  +
        theme(panel.grid.major = element_blank(),
              panel.grid.minor = element_blank(),
              panel.background = element_rect(fill="white"), #gainsboro
              axis.text = element_blank(),
              axis.ticks = element_blank(),
              axis.title = element_text(size=15),
              plot.title = element_text(size=18),
              legend.title = element_text(size=15),
              legend.text = element_text(size=13)) +
        coord_fixed()
}

# Depreciated
sce.dotplot <- function(sce, genes, group.by = "cell_type"){
  tmp <- lapply(genes, function(g){

    exprs <- sce.get.gene.exprs(sce, g, umi = T)

    if(is.null(exprs)){
      return(NULL)
    }else{
      colData(sce) %>%
        as_tibble() %>%
        add_column(umi = exprs) %>%
        mutate(exprs = umi / colSums(assay(sce, "umi"))) %>%
        mutate(expresses = ifelse(umi > 0 , 1, 0)) %>%
        dplyr::rename(gr = group.by) %>%
        group_by(gr) %>%
        summarise(exprs = mean(exprs),
                  expresses = sum(expresses) / n()) %>%
        add_column(gene = g) %>%
        mutate(exprs = ((exprs - min(exprs)) / max(exprs - min(exprs))))
    }

  }) %>% do.call(rbind, .)

  if(is.null(tmp)) stop("No genes found!\n")

  tmp = tmp %>%
    filter(!is.na(gene))
  tmp %>%
    ggplot(aes(x=gene, y=gr)) +
    geom_point(aes(color=exprs, size=expresses)) +
    theme(axis.text.x = element_text(angle=90, hjust=1,vjust=.5)) +
    scale_color_continuous(name = "scaled mean expr.", low="gray", high="red") +
    scale_size_continuous(name="fraction pos.") +
    ylab(group.by) +
    scale_radius(limits = c(0,NA), range=c(0,5)) +
    scale_x_discrete(limits = genes[genes %in% tmp$gene],
                     breaks = genes[genes %in% tmp$gene])
}


sce.plot.dotplot = function(sce, genes, group.by = "cell_type"){
    stopifnot(group.by %in% colnames(colData(sce)))

    genes = unique(genes)
    use.genes = genes[genes %in% rownames(sce)]

    stopifnot(length(use.genes) > 0)

    cd = sce.colData(sce)
    umi = assay(sce, "umi")
    exprs = t(t(umi[use.genes, ]) / colSums(umi))
    umi = umi[use.genes, ]

    umi.df = reshape2::melt(as.matrix(umi)) %>%
        as_tibble() %>%
        dplyr::rename(gene = Var1,
                      cell_id = Var2,
                      umi = value) %>%
        mutate_if(is.factor, as.character)

    ex.df = reshape2::melt(as.matrix(exprs)) %>%
        as_tibble() %>%
        dplyr::rename(gene = Var1,
                      cell_id = Var2,
                      exprs = value) %>%
        mutate_if(is.factor, as.character)

    ex.df %>%
        left_join(umi.df, by=c("cell_id", "gene")) %>%
        left_join(cd, by="cell_id") %>%
        dplyr::rename(gr = group.by) %>%
        group_by(gr, gene) %>%
        summarise(mexprs = mean(exprs),
                  frac.exprs = sum(umi>0) / n()) %>%
        group_by(gene) %>%
        mutate(mexprs = mexprs - min(mexprs)) %>%
        mutate(mexprs = mexprs / max(mexprs)) %>%
        ggplot(aes(gr, gene, size=frac.exprs, color = mexprs)) +
        ylim(use.genes) +
        geom_point() +
        scale_radius(limits = c(0, NA), range = c(0,5), name="fraction exprs.") +
        #scale_color_continuous(name = "scaled exprs.", low="gray", high = "red") +
        scale_color_gradientn(name = "scaled exprs.", colors = c("gray", rev(viridis::inferno(100)))) +
        theme(axis.text.x = element_text(angle=90, hjust=1, vjust=.5)) +
        labs(y = "", x = "")
}

sce.geneid.to.genename <- function(sce, genes){
  stopifnot("gene_name" %in% colnames(rowData(sce)))

  out <- lapply(genes, function(gene){
    rownames(rowData(sce))[which(rowData(sce)[,"gene_name"] == gene)]
  }) %>% do.call("c",.)

  return(out)
}

sce.heatmap <- function(sce, genes, order.by = "cell_type", sort.by = "stage.ord", mc.cores = 10){
  stopifnot(length(genes) > 1)

  sce.list <- sce.split.by.col(sce, order.by,n.cores = mc.cores)
  sce.list <- sce.list[order(names(sce.list))]

  gene.ids <- sce.geneid.to.genename(sce, genes)
  print(gene.ids)
  print("getting exprs")
  ex.list <- pbmcapply::pbmclapply(names(sce.list), function(i){
    x <- sce.list[[i]]
    pb <- sce.pseudobulk(x, sort.by, mc.cores = 1, min_cells = 2)
    pb <- pb[,sort(colnames(pb))]
    pb <- t(t(pb[gene.ids,]) / colSums(pb))
    colnames(pb) <- paste0(i, "//", colnames(pb))
    return(pb)
  }, mc.cores = mc.cores)

  print("combining")
  ex <- do.call(cbind, ex.list)
  rownames(ex) <- genes

  ex.annot <- data.frame(
    cell.type = str_split_fixed(colnames(ex), "//", 2)[,1],
    stage = str_split_fixed(colnames(ex), "//", 2)[,2],
    row.names = colnames(ex)
  )
  pheatmap::pheatmap(ex,
                     cluster_cols = F,
                     show_colnames =F,
                     annotation_col = ex.annot,
                     clustering_distance_rows = "correlation",
                     scale = "row")
}

sce.make.rowData <- function(sce, meta.data){
  colnames(meta.data) <- c("gene_id", "gene_name", "ortho_id", "ortho_name", "ortho_type")

  rw <- tibble(
    gene_id = rownames(sce)
  ) %>%
    left_join(
      meta.data %>%
        group_by(gene_id, gene_name, ortho_type) %>%
        summarise(ortho_id = paste0(ortho_id, collapse=","),
                  ortho_name = paste0(ortho_id, collapse=","))
    )

  if(!is.null(ncol(rowData(sce)))){
    rowData(sce) <- rw %>%
      left_join(
        as.data.frame(rowData(sce)) %>%
          rownames_to_column("gene_id")
      ) %>%
      as.data.frame() %>%
      column_to_rownames("gene_id")
  }else{
    rowData(sce) <- rw %>% as.data.frame() %>% column_to_rownames("gene_id")
  }


  return(sce)
}

sce.colData <- function(sce){
  out <- colData(sce) %>%
    as.data.frame() %>%
    rownames_to_column("cell_id") %>%
    as_tibble()

  colnames(out) <- c("cell_id", colnames(colData(sce)))

  return(out)
}

sce.proportion.plot <- function(sce, group.by = "cell_type", along = "stage.ord"){
  require(tidyverse)
  require(SingleCellExperiment)

  stopifnot(group.by %in% colnames(colData(sce)))
  stopifnot(along %in% colnames(colData(sce)))

  if(is.null(cbar.name)){
    cbar.name <- fill
  }

  sce.colData(sce) %>%
    dplyr::rename(gr = group.by)%>%
    dplyr::rename(al = along) %>%
    group_by(gr, al) %>%
    summarise(n = n()) %>%
    group_by(al) %>%
    mutate(prop = n/sum(n)) %>%
    ggplot(aes(x = al, y = prop, fill = gr)) +
    geom_bar(stat="identity") +
    theme(axis.text.x = element_text(angle=90, hjust=1, vjust=.5)) +
    scale_fill_discrete(name=cbar.name)

}

sce.proportion.plot.multi <- function(..., fill = "cell_type", along="HUM corr. stage", feat = "species", ncol=2, cbar.name = NULL, drop=F, scales="fixed"){
  require(tidyverse)
  require(SingleCellExperiment)

  sce.list <- list(...)


  if(is.null(cbar.name)){
    cbar.name <- fill
  }

  for(i in sce.list){
    stopifnot(fill %in% colnames(colData(i)))
    stopifnot(along %in% colnames(colData(i)))
  }

  df <- do.call(rbind, lapply(sce.list, function(x){
    tmp <- sce.colData(x)[,c(fill, along, feat)]
    return(tmp)
  }))

  df %>%
    dplyr::rename(fl = fill) %>%
    dplyr::rename(al = along) %>%
    dplyr::rename(sp = feat) %>%
    group_by(fl, al, sp) %>%
    summarise(n = n()) %>%
    group_by(al, sp) %>%
    mutate(prop = n/sum(n)) %>%
    ggplot(aes(x = al, y = prop, fill = fl)) +
    geom_bar(stat="identity") +
    theme(axis.text.x = element_text(angle=90, hjust=1, vjust=.5)) +
    facet_wrap(vars(sp), ncol=ncol, drop = drop, scales=scales) +
    scale_fill_discrete(name=cbar.name)

}

sce.propline <- function(..., facet = "cell_type", group="TissueID", along="HUM corr. stage", feat = "species", ncol=2, cbar.name = NULL, drop=F, scales="fixed", min.cells = 10){
  require(tidyverse)
  require(SingleCellExperiment)

  sce.list <- list(...)


  if(is.null(cbar.name)){
    cbar.name <- facet
  }

  for(i in sce.list){
    stopifnot(facet %in% colnames(colData(i)))
    stopifnot(along %in% colnames(colData(i)))
  }

  df <- do.call(rbind, lapply(sce.list, function(x){
    tmp <- sce.colData(x)[,c(facet, along, feat, group)]
    return(tmp)
  }))

  df %>%
    dplyr::rename(fl = facet) %>%
    dplyr::rename(al = along) %>%
    dplyr::rename(sp = feat) %>%
    dplyr::rename(gr = group) %>%
    group_by(fl, al, sp, gr) %>%
    summarise(n = n()) %>%
    filter(n >= min.cells) %>%
    group_by(al, sp, gr) %>%
    mutate(prop = n/sum(n)) %>%
    ggplot(aes(x = al, y = prop, color = sp)) +
    geom_point() +
    stat_summary(aes(group=sp), geom = "line", fun.y = "median") +
    stat_summary(aes(group=sp), geom = "linerange", fun.ymax = max, fun.ymin = min) +
    theme(axis.text.x = element_text(angle=90, hjust=1, vjust=.5)) +
    facet_wrap(vars(fl), ncol=ncol, drop = drop, scales=scales) +
    scale_fill_discrete(name=cbar.name)

}


sce.pca <- function(sce, n=50){

  umi <- assay(sce, "umi")

  cat("HVG identification\n")
  hvg <- get.info.genes(umi)

  cat("Normalizing\n")
  nrm <- t(sqrt(t(umi[hvg,]) / colSums(umi)))

  cat("Running PCA\n")
  set.seed(1234)
  pca.results <- irlba::prcomp_irlba(t(nrm), n=n)

  feature.loadings <- pca.results$rotation
  rownames(feature.loadings) <- hvg

  cell.embeddings <- pca.results$x
  rownames(cell.embeddings) <- colnames(umi)

  colnames(feature.loadings) <- colnames(cell.embeddings) <- paste0("PC", sprintf("%02d", 1:ncol(cell.embeddings)))

  sce@metadata$hvg <- hvg
  sce@metadata$feature.loadings <- feature.loadings
  reducedDim(sce, "pca") <- cell.embeddings

  return(sce)
}

sce.smoothed.exprs <- function(sce, g){
  stopifnot("pca" %in% names(sce@reducedDims))
  # stopifnot(g %in% rownames(sce))

  if(length(g) == 0) {
    return(
      sce.colData(sce) %>%
        add_column(smoothed = 0)
    )
  }

  if(g %in% rownames(sce)){
    cell.embeddings <- reducedDim(sce, "pca")

    umis <- assay(sce, "umi")[g,]
    sf <- colSums(assay(sce,"umi"))

    a <- (cell.embeddings %*% t( matrix(log1p(umis / sf), nrow = 1) %*% cell.embeddings))[,1]
    a[a<= 0] <- 0

    return(
      sce.colData(sce) %>%
        add_column(smoothed = a)
    )

  }else{
    return(
      sce.colData(sce) %>%
        add_column(smoothed = 0)
    )
  }


}

sce.smoothed.exprs.mat <- function(sce, g){
  stopifnot("pca" %in% names(sce@reducedDims))
  stopifnot(sum(rownames(sce) %in% g) == length(g))


  # if(g %in% rownames(sce)){
  cell.embeddings <- reducedDim(sce, "pca")

  umis <- assay(sce, "umi")[g,,drop=F]
  sf <- colSums(assay(sce,"umi"))

  a <- (cell.embeddings %*% t( log1p(umis[g,] / sf) %*% cell.embeddings))
  a[a <= 0] <- 0

  return(
    a
  )
}


sce.run.harmony = function(sce, vars_use = c("batch"), use.redDim = "pca",
                           reference_values = NULL){
    stopifnot(use.redDim %in% reducedDimNames(sce))

    library(harmony)
    meta = sce.colData(sce) %>%
        as.data.frame() %>%
        column_to_rownames("cell_id")

    l = reducedDim(sce, use.redDim)

    set.seed(1234)
    hrm = HarmonyMatrix(data_mat = l,
                  meta_data = meta,
                  do_pca = FALSE,
                  verbose = T,
                  vars_use=vars_use,
                  reference_values = reference_values)

    reducedDim(sce, "harmony") = hrm

    return(sce)
}

get.nn <- function(x, y=x, k=25, metric="cosine", return.self = F){
  if(!return.self){
    if(sum(dim(x) == dim(y)) == 2){
      if(sum(x == y) == length(x)){
        k <- k+1
      }
    }
  }

  pd <- reticulate::import("pynndescent")

  cat("Building index\n")
  idx <- pd$NNDescent(x, metric = metric)

  cat("Finding nearest neighbors\n")
  nn <- idx$query(y, k=as.integer(k))

  names(nn) <- c("nn.idx", "nn.dist")

  nn$nn.idx <- nn$nn.idx + 1

  if(!return.self){
    nn <- lapply(nn, function(x) x[,2:ncol(x)])
  }

  return(nn)
}

#' Gets top N markers for each cluster
#'
#' Uses tf-idf ordering to get the top N markers of each cluster.  For each cluster, either the top N or all genes passing the hypergeometric test with the FDR specified, whichever list is smallest.
#'
#' Term Frequency - Inverse Document Frequency is used in natural language processing to identify terms specific to documents.  This function uses the same idea to order genes within a group by how predictive of that group they are.  The main advantage of this is that it is extremely fast and gives reasonable results.
#'
#' To do this, gene expression is binarised in each cell so each cell is either considered to express or not each gene.  That is, we replace the counts with \code{toc > zeroCut}.  The frequency with which a gene is expressed within the target group is compared to the global frequency to calculate the tf-idf score.  We also calculate a multiple hypothesis corrected p-value based on a hypergeometric test, but this is extremely permissive.
#'
#' @export
#' @param toc Table of counts.  Must be a sparse matrix.
#' @param clusters Vector of length \code{ncol(toc)} giving cluster membership.
#' @param N Number of marker genes to return per cluster.
#' @param FDR False discover rate to use.
#' @param expressCut Value above which a gene is considered expressed.
#' @return data.frame with top N markers (or all that pass the hypergeometric test) and their statistics for each cluster.
#' @examples
#' #Calculate markers of clusters in toy data
#' mrks = quickMarkers(scToy$toc,scToy$metaData$clusters)
#' \dontrun{
#' #Calculate markers from Seurat (v3) object
#' mrks = quickMarkers(srat@assays$RNA@count,srat@active.ident)
#' }
quickMarkers = function(toc,clusters,N=10,FDR=0.01,expressCut=0.9){
  #Convert to the more manipulable format
  toc = as(toc,'dgTMatrix')
  w = which(toc@x>expressCut)
  #Get the counts in each cluster
  clCnts = table(clusters)
  nObs = split(factor(rownames(toc))[toc@i[w]+1],clusters[toc@j[w]+1])
  nObs = sapply(nObs,table)
  #Calculate the observed and total frequency
  nTot = rowSums(nObs)
  tf = t(t(nObs)/as.integer(clCnts[colnames(nObs)]))
  ntf = t(t(nTot - nObs)/as.integer(ncol(toc)-clCnts[colnames(nObs)]))
  idf = log(ncol(toc)/nTot)
  score = tf*idf
  #Calculate p-values
  qvals = lapply(seq_len(ncol(nObs)),function(e)
                 p.adjust(phyper(nObs[,e]-1,nTot,ncol(toc)-nTot,clCnts[colnames(nObs)[e]],lower.tail=FALSE),method='BH'))
  qvals = do.call(cbind,qvals)
  colnames(qvals) = colnames(nObs)
  #Get gene frequency of second best
  sndBest = lapply(seq_len(ncol(tf)),function(e) apply(tf[,-e,drop=FALSE],1,max))
  sndBest = do.call(cbind,sndBest)
  colnames(sndBest) = colnames(tf)
  #And the name
  sndBestName = lapply(seq_len(ncol(tf)),function(e) (colnames(tf)[-e])[apply(tf[,-e,drop=FALSE],1,which.max)])
  sndBestName = do.call(cbind,sndBestName)
  colnames(sndBestName) = colnames(tf)
  rownames(sndBestName) = rownames(tf)
  #Now get the top N for each group
  w = lapply(seq_len(ncol(nObs)),function(e){
             o = order(score[,e],decreasing=TRUE)
             if(sum(qvals[,e]<FDR)>=N){
               o[seq(N)]
             }else{
               o[qvals[o,e]<FDR]
             }
                 })
  #Now construct the data.frame with everything
  ww = cbind(unlist(w,use.names=FALSE),rep(seq_len(ncol(nObs)),lengths(w)))
  out = data.frame(gene = rownames(nObs)[ww[,1]],
                   cluster = colnames(nObs)[ww[,2]],
                   geneFrequency = tf[ww],
                   geneFrequencyOutsideCluster = ntf[ww],
                   geneFrequencySecondBest = sndBest[ww],
                   geneFrequencyGlobal = nTot[ww[,1]]/ncol(toc),
                   secondBestClusterName = sndBestName[ww],
                   tfidf = score[ww],
                   idf = idf[ww[,1]],
                   qval = qvals[ww],
                   stringsAsFactors=FALSE)
  return(out)
}

sce.tfidf = function(sce, group_col = "cell_type", N = 10, FDR = .01, expressCut = .9){
    stopifnot(group_col  %in% colnames(colData(sce)))

    X = assay(sce, "umi")
    gr = colData(sce)[,group_col]

    quickMarkers(X, gr, N = N, FDR = FDR, expressCut = expressCut)
}


# Approximation of expression by SVD

CreateSceApprox = setClass("sceApprox",
         slots = list(
             D = "matrix",
             U = "matrix",
             V = "matrix",
             colData = "tbl"
         ))

sce.approx.expression = function(sce, n = 100, genes = rownames(sce)){
    X = assay(sce, "umi")[genes, ]
    X = t(X) / colSums(X)
    X = sqrt(X)

    x.svd = irlba::irlba(X, nv = n)

    D = diag(x.svd$d)
    U = x.svd$u
    V = x.svd$v

    rownames(U) = rownames(X)
    rownames(V) = colnames(X)

    CreateSceApprox(D = D, U = U, V = V, colData = sce.colData(sce))
}


setMethod("show",
          "sceApprox",
          function(object){
              cat("nCells:", nrow(object@U), "\n")
              cat("nGenes:", nrow(object@V), "\n")
              cat("nDims:", ncol(object@U), "\n")
              cat("\n")
              cat("colData colnames:\n" )
              cat(colnames(object@colData))
          })

setGeneric("approximate",
           def = function(object, gene){
               standardGeneric("approximate")
           })

#` Approximation of expression using SVD on SingleCellExperiment object
#`
#` @param object SingleCellExperiment object to approximate expression to
#` @param gene Gene or genes as character or character vector.
#`             Needs to be present as rownames in the SCE object
setMethod("approximate",
          "sceApprox",
          function(object, gene){
              stopifnot(all(gene %in% rownames(object@V)))

              appr = t(object@D %*% t(object@U)) %*% t(object@V[gene, , drop=F])
              appr[appr < 0] = 0

              return(appr)
          })

setGeneric("approximate_tbl",
           def = function(object, gene){
               standardGeneric("approximate_tbl")
           })

setMethod("approximate_tbl",
          "sceApprox",
          function(object, gene){
              appr = approximate(object, gene)
              reshape2::melt(appr) %>%
                  dplyr::rename(cell_id = Var1,
                                gene = Var2) %>%
                  mutate(cell_id = as.character(cell_id)) %>%
                  #rownames_to_column("cell_id") %>%
                  left_join(object@colData, by="cell_id")
          })