diff --git a/R/DAVINCHI.R b/R/DAVINCHI.R index 8938461..9c89a28 100644 --- a/R/DAVINCHI.R +++ b/R/DAVINCHI.R @@ -14,7 +14,10 @@ pkgs <- c("Rcpp", "cluster", "mclust", "dendextend", -"BiocNeighbors") +"BiocNeighbors", +"future", +"future.apply", +"progressr") .onAttach <- function(libname, pkgname){ diff --git a/R/import.R b/R/import.R index ed834e7..6da5b48 100644 --- a/R/import.R +++ b/R/import.R @@ -16,5 +16,8 @@ #' @importFrom stats wilcox.test #' @importFrom utils head #' @importFrom utils packageVersion +#' @importFrom future plan multicore +#' @importFrom future.apply future_lapply +#' @importFrom progressr handlers handler_txtprogressbar with_progress ## usethis namespace: end NULL \ No newline at end of file diff --git a/R/new.R b/R/new.R index 452fb6c..36a88aa 100644 --- a/R/new.R +++ b/R/new.R @@ -1,3 +1,7 @@ +library(future) +library(future.apply) +library(progressr) + #' Adaptive Leiden/Louvain clustering #' #' When the number of clusters is set, run louvain/leiden clustering in an adaptive manner. 
#Other parameters
#save.label <- "rna" #rna, peak as input #preprocess
#normaliza.version <- "gene" #should be used when processing the data #preprocess
# - rna, adt -> gene
# - atac -> gene, LSI_1, LSI_2, LSI_3

#clustering input parameters
# - L2.opt <- "Yes" #Yes, No - whether L2norm before finding the nearest neighbors or mclust
# - k.opt <- 40 #neighboring paramters for louvain

#' Cross-sample (Horizontal) integration wrapper
#'
#' Cross-sample (Horizontal) integration, parallelized over samples with the
#' `future` / `future.apply` framework and a `progressr` progress bar.
#'
#' @param dataset.opts Character vector naming the slices/samples (required).
#' @param Y.list List of per-sample data matrices.
#' @param L.list List of per-sample graph Laplacians (or equivalent inputs for
#'   `manifoldDecomp_adaptive`).
#' @param coor.list List of per-sample spatial coordinate matrices.
#' @param k.arg.list Parameters swept in the self-contrastive learning step.
#' @param L4.arg Decomposition parameter forwarded to `manifoldDecomp_adaptive`.
#' @param input.opt Either "OnlyDeco" or "FineTune" — which embedding feeds the
#'   integration step.
#' @param h.opt Either "default" or "first" — which horizontal integration
#'   routine to run.
#' @param mod.opt Either "all" or "common".
#' @param L2.in One of "default", "L2norm", "L2norm.joint".
#' @param random.seed Seed forwarded to the decomposition routines.
#' @param smooth Whether to spatially smooth the integrated embedding.
#' @param workers Number of cores/workers for parallelization.
#' @param ram Maximum RAM per exported globals, in GB.
#' @param sequential If TRUE, run everything on a sequential plan (debugging).
#' @param multi "multicore" (mac and linux) or "multisession" (for windows).
#' @return A list with the inputs, intermediate embeddings, integration result,
#'   the integrated matrix `mat` (and `mat.smooth` when `smooth = TRUE`).
#' @export
Horizontal.Integration.Assemble.Parallel <- function(
  dataset.opts = NULL,
  Y.list,
  L.list,
  coor.list,
  k.arg.list = c(4, 6, 8, 10, 11, 12, 13, 15, 16, 18, 20, 22, 24), #the parameters for self-contrastive learning step
  L4.arg = 50, #parameters
  input.opt = "FineTune", #OnlyDeco, FineTune
  h.opt = "first", #default, first
  mod.opt = "all", #all, common
  L2.in = "default", #default, L2norm, L2norm.joint
  random.seed = 123,
  smooth = FALSE,
  workers = 2, # number of cores/workers for parallelization
  ram = 15, # in GB max RAM usage
  sequential = FALSE,
  multi = "multicore" # multicore (mac and linux), multisession (for windows)
  ){

  #sanity check
  #########################################################
  if (is.null(dataset.opts)){
    stop("Please include a vector of character names corresponding to the slices.")
  }#if

  # BUGFIX: pkgs_to_load was previously assigned only on the parallel path,
  # so every later future_lapply(..., future.packages = pkgs_to_load) errored
  # when sequential = TRUE. Define it unconditionally.
  pkgs_to_load <- if (!sequential && multi == "multisession") c("DaVinci", "progressr") else NULL

  if (sequential){
    #sequential execution
    future::plan(future::sequential)
  }else{
    #parallel execution
    if (multi == "multisession"){
      future::plan(future::multisession, workers = workers)
    }else{
      future::plan(future::multicore, workers = workers)
    }
    # BUGFIX: the original reset the plan via future::plan(sequential) after the
    # final return(), which is unreachable (and `sequential` there shadowed the
    # logical argument rather than naming future::sequential). on.exit()
    # guarantees the workers are released on every exit path.
    on.exit(future::plan(future::sequential), add = TRUE)
  }

  options(future.globals.maxSize = ram * 1024^3)

  # progress bar settings
  progressr::handlers("progress")

  options(
    progressr.enable = TRUE,
    progressr.enable_after = 0,
    progressr.delay_stdout = FALSE
  )

  progressr::handlers(progressr::handler_txtprogressbar(
    file = stdout(),
    enable_after = 0
  ))

  #self-contrastive learning step
  #########################################################

  exhaustive.list <- progressr::with_progress({

    p <- progressr::progressor(steps = length(Y.list) * length(k.arg.list))

    future.apply::future_lapply(seq_along(Y.list), function(ii) {

      message(paste0("Working on Sample ", ii, " / ", length(Y.list)))
      proj <- list()
      s0 <- NULL

      count <- 0
      for (k.arg in k.arg.list){
        message("Working on k=", k.arg)

        if (is.null(s0)){
          # first k: no Schur complement cache yet
          ICAp.res <- manifoldDecomp_adaptive(Y.list[[ii]],
                                              L.list[[ii]],
                                              k = k.arg,
                                              L4 = L4.arg,
                                              L4_adaptive = 2,
                                              to_drop = TRUE,
                                              save.complete = TRUE,
                                              verbose = FALSE,
                                              random.seed = random.seed)
        }else{
          # later k values reuse the cached shur0 from the first decomposition
          ICAp.res <- manifoldDecomp_adaptive(Y.list[[ii]],
                                              L.list[[ii]],
                                              k = k.arg,
                                              L4 = L4.arg,
                                              L4_adaptive = 2,
                                              to_drop = TRUE,
                                              save.complete = TRUE,
                                              shur0 = s0,
                                              verbose = FALSE,
                                              random.seed = random.seed)
        }#else

        count <- count + 1
        proj[[count]] <- ICAp.res

        p(sprintf("Sample %d: k=%d", ii, k.arg))

        if (is.null(s0)){
          s0 <- ICAp.res$shur0
        }#if

      }#for k.arg

      return(proj)
    }, future.seed = TRUE, future.packages = pkgs_to_load)

  })

  #self contrastive learning step
  #######################################################################################
  if (length(k.arg.list) == 1){

    #only one k: nothing to contrast, reuse the single decomposition everywhere
    embed.list <- list()
    embed.list.finetune <- list()
    dav.res.list <- list()

    for (ii in seq_along(Y.list)){
      embed.list[[ii]] <- exhaustive.list[[ii]][[1]]
      embed.list.finetune[[ii]] <- exhaustive.list[[ii]][[1]]
      dav.res.list[[ii]] <- exhaustive.list[[ii]][[1]]
    }#for ii

  }else{

    #self contrastive learning
    #############################
    embed.list <- future.apply::future_lapply(seq_along(exhaustive.list), function(ii) {
      self_deco(exhaustive.list[[ii]],
                LVs.filter.thr = 0.9,
                freq = 1,
                opt = "B")
    }, future.seed = TRUE, future.packages = pkgs_to_load)

    #run again to finetune
    #############################
    embed.list.finetune <- future.apply::future_lapply(seq_along(embed.list), function(ii) {
      manifoldDecomp_adaptive(Y.list[[ii]],
                              L.list[[ii]],
                              k = nrow(embed.list[[ii]]$LVs),
                              B = embed.list[[ii]]$LVs,
                              L4 = L4.arg,
                              L4_adaptive = 2,
                              to_drop = TRUE,
                              save.complete = TRUE,
                              shur0 = exhaustive.list[[ii]][[1]]$shur0,
                              verbose = FALSE,
                              random.seed = random.seed)
    }, future.seed = TRUE, future.packages = pkgs_to_load)

    #Finetune after self-contrastive learning
    if (input.opt == "OnlyDeco"){

      dav.res.list <- future.apply::future_lapply(seq_along(embed.list.finetune), function(ii) {
        ICAp.res <- embed.list.finetune[[ii]]
        # BUGFIX: the original referenced an undefined `embed` here; the
        # paired/raw LVs come from the self-contrastive result embed.list[[ii]].
        embed <- embed.list[[ii]]
        #construct the pseudo ICAp.res objects
        list(Z = t(embed$LVs.pair),
             B = embed$LVs,
             L4 = ICAp.res$L4,
             shur0 = ICAp.res$shur0,
             L1 = ICAp.res$L1,
             L2 = ICAp.res$L2)
      }, future.seed = TRUE, future.packages = pkgs_to_load)

    }else if (input.opt == "FineTune"){

      dav.res.list <- embed.list.finetune

    }#else if
  }#else

  if (length(dataset.opts) == 1){

    #no need to integrate here, directly return here
    mat <- t(dav.res.list[[1]]$B)
    mat <- as.matrix(mat)
    mat.slice.id <- rep(dataset.opts[1], nrow(mat))
    #no integration.res
    return(list(Y.list = Y.list,
                coor.list = coor.list,
                L.list = L.list,
                embed.list = embed.list,
                embed.list.finetune = embed.list.finetune,
                dataset.opts = dataset.opts,
                mat.slice.id = mat.slice.id,
                mat = mat,
                exhaustive.list = exhaustive.list))

  }else{
    #Horizontal integration
    ########################################################

    if (h.opt == "default"){

      integration.res <- Horizontal.Integration(Y.list,
                                                L.list,
                                                coor.list,
                                                dav.res.list = dav.res.list,
                                                LVs.filter.thr = 0.8,
                                                mod = mod.opt,
                                                remove.LV1 = FALSE,
                                                L2.option = L2.in)

    }else if (h.opt == "first"){

      integration.res <- Horizontal.Integration.first(Y.list,
                                                      L.list,
                                                      coor.list,
                                                      dav.res.list = dav.res.list,
                                                      LVs.filter.thr = 0.8,
                                                      mod = mod.opt,
                                                      remove.LV1 = FALSE,
                                                      L2.option = L2.in)

    }#else

    #cleanup the output
    #########################################################
    embed <- integration.res$LVs_embeddings
    #make sure the sample names can align
    # rownames are "<sliceIndex>_<spotName>"; split off the leading slice index
    sids <- unlist(lapply(strsplit(rownames(embed), "_"), function(x){paste0(x[-1], collapse = "_")}))
    mat.slice.id <- unlist(lapply(strsplit(rownames(embed), "_"), function(x){x[1]}))
    mat.slice.id <- dataset.opts[as.numeric(mat.slice.id)]

    rownames(embed) <- sids
    mat <- as.matrix(embed)

    #smoothing first no matter used or not
    #########################################################
    if (!smooth){
      return(list(Y.list = Y.list,
                  coor.list = coor.list,
                  L.list = L.list,
                  embed.list = embed.list,
                  embed.list.finetune = embed.list.finetune,
                  dataset.opts = dataset.opts,
                  mat.slice.id = mat.slice.id,
                  integration.res = integration.res,
                  mat = mat,
                  exhaustive.list = exhaustive.list))
    }else{
      mat.smooth <- mat

      smooth_updates <- future.apply::future_lapply(seq_along(dataset.opts), function(ii) {
        print(ii)

        subpart.index <- which(mat.slice.id == dataset.opts[ii])
        subpart <- mat[subpart.index, ]
        subpart.smooth <- refinement.batch(subpart,
                                           as.matrix(coor.list[[ii]])[rownames(subpart), ],
                                           neighbor.option = "KNN",
                                           neighbor.arg = 8,
                                           tasks = "continuous")

        # NOTE(review): assumes refinement.batch returns the smoothed matrix
        # transposed relative to its input — TODO confirm against refinement.batch.
        return(list(idx = subpart.index,
                    val = (subpart + t(subpart.smooth)) / 2))
      }, future.seed = TRUE, future.packages = pkgs_to_load)

      for (update in smooth_updates) {
        mat.smooth[update$idx, ] <- update$val
      }

      return(list(Y.list = Y.list,
                  coor.list = coor.list,
                  L.list = L.list,
                  embed.list = embed.list,
                  embed.list.finetune = embed.list.finetune,
                  dataset.opts = dataset.opts,
                  mat.slice.id = mat.slice.id,
                  integration.res = integration.res,
                  mat = mat,
                  mat.smooth = mat.smooth,
                  exhaustive.list = exhaustive.list))

    }#else

  }#else

}#Horizontal.Integration.Assemble.Parallel


#' Integrate multiple modalities
#'
#' Computes per-spot modality weights (Seurat-WNN style) from within- and
#' cross-modality kernel predictions, optionally reweighted by library size,
#' then builds a weighted union KNN graph and its SNN graph.
#'
#' @param modals Named list of per-modality embedding matrices (spots x dims),
#'   all sharing the same row (spot/bin) names.
#' @param mats Matrices used for prediction/distance; defaults to `modals`.
#' @param library.sizes Optional list (one entry per modality) of per-spot
#'   library sizes; modalities with a NULL entry are not reweighted.
#' @param library.types Optional list of modality types ("rna" or "atac") used
#'   to pick a default baseline when `baselines` is not supplied.
#' @param baselines Optional list of reference UMI baselines per modality.
#' @param n_neighbors Number of neighbors for the within-modality KNN.
#' @param n_neighbors_large Number of neighbors for the union candidate pool.
#' @param sigma.idx Neighbor index used for kernel width when `snn.far.nn` is FALSE.
#' @param snn.far.nn Use SNN-based kernel width (via Seurat internals) if TRUE.
#' @param L2norm "row", "column", or NULL — L2 normalization applied up front.
#' @param sd.scale Multiplier on the kernel width.
#' @param cross.contant Small constant added to the cross-modality kernel mean
#'   (name kept as-is for backward compatibility, despite the typo).
#' @param prune.SNN Pruning threshold forwarded to Seurat's SNN computation.
#' @param kernel.power Unused here; kept for interface compatibility.
#' @return A list with per-modality weights, weighted neighbor index, KNN and
#'   SNN sparse matrices, and the (possibly adjusted) library-size inputs.
#' @import BiocNeighbors
#' @details This function requires `Seurat`. Make sure it is installed
#' @export
MultiModalIntegration <- function(
  modals,
  mats = modals,
  library.sizes = NULL,
  library.types = NULL,
  baselines = NULL,
  n_neighbors = 40,
  n_neighbors_large = 50,
  sigma.idx = n_neighbors,
  snn.far.nn = TRUE,
  L2norm = "column",
  sd.scale = 1,
  cross.contant = 1e-4,
  prune.SNN = 0,
  kernel.power = 1){

  n_mod <- length(modals)
  mod_names <- names(modals)
  if (is.null(mod_names)) mod_names <- paste0("modal.", seq_len(n_mod))

  if (n_mod < 2) stop("At least 2 modalities are required.")

  for (m in seq_len(n_mod)) {
    if (sum(duplicated(rownames(modals[[m]]))) > 0) {
      stop("The spot/bin names are not unique.")
    }
  }#for

  if (!requireNamespace("Seurat", quietly = TRUE)) {
    stop("Seurat is required for this function but is not installed. Please install it.")
  }#if

  #align all modalities to the row order of modals[[1]]
  ref_rownames <- rownames(modals[[1]])
  for (m in seq_len(n_mod)) {
    if (is.null(rownames(modals[[m]]))) {
      stop(paste0("Modality ", mod_names[m], " has no valid row names. Please examine and assign proper row names."))
    }
    if (!all(sort(ref_rownames) == sort(rownames(modals[[m]])))) {
      stop(paste0("Row names of modality ", mod_names[m], " don't align with modality 1. Please check carefully."))
    }
    modals[[m]] <- modals[[m]][ref_rownames, ]
    mats[[m]] <- mats[[m]][ref_rownames, ]
  }#for
  message("Alignment test passed")

  message("n_neighbors = ", n_neighbors)
  message("n_neighbors_large = ", n_neighbors_large)
  message("L2 = ", L2norm)

  #L2 normalization
  #################################
  if (!is.null(L2norm)){
    if (L2norm == "row"){

      message("Sample-wise L2Norm")
      for (m in seq_len(n_mod)) {
        modals[[m]] <- L2Norm(modals[[m]], MARGIN = 1)
        mats[[m]] <- L2Norm(mats[[m]], MARGIN = 1)
      }#for

    }else if (L2norm == "column"){

      message("Column-wise L2Norm")
      for (m in seq_len(n_mod)) {
        modals[[m]] <- L2Norm(modals[[m]], MARGIN = 2)
        mats[[m]] <- L2Norm(mats[[m]], MARGIN = 2)
      }#for

    }#else
  }#if

  #build up all graph
  #################################
  prebuild.indices <- vector("list", n_mod)
  neighbors <- vector("list", n_mod)
  neighbors.large <- vector("list", n_mod)

  for (m in seq_len(n_mod)) {
    prebuild.indices[[m]] <- BiocNeighbors::buildIndex(modals[[m]], BNPARAM = BiocNeighbors::AnnoyParam())
    neighbors[[m]] <- FindNN(data = modals[[m]], number.of.NN = n_neighbors, prebuild.index = prebuild.indices[[m]])
    neighbors.large[[m]] <- FindNN(data = modals[[m]], number.of.NN = n_neighbors_large, prebuild.index = prebuild.indices[[m]])
  }#for

  message("Calculating the modality weights")
  ########################################################################################

  #calculate the within and cross-modality distance
  #################################

  #nearest distance
  NNdists <- lapply(neighbors, function(nb) nb$distance[, 1])

  #within modality prediction and cross modality prediction
  mat.pred <- lapply(seq_len(n_mod), function(m) {
    lapply(seq_len(n_mod), function(k) PredictAssay(mats[[m]], neighbors[[k]]$index))
  })

  #calculate the distance, within and across modalities
  impute.dists <- lapply(seq_len(n_mod), function(m) {
    lapply(seq_len(n_mod), function(k) ImputeDist(mats[[m]], mat.pred[[m]][[k]], 0))
  })

  #calculate the kernel width
  #################################
  sds <- vector("list", n_mod)
  if (snn.far.nn){
    for (m in seq_len(n_mod)) {
      snn.matrix <- Seurat:::ComputeSNN(nn_ranked = neighbors[[m]]$index, prune = prune.SNN)
      snn.width <- Seurat:::ComputeSNNwidth(
        snn.graph = snn.matrix,
        k.nn = n_neighbors,
        l2.norm = FALSE,
        embeddings = mats[[m]],
        nearest.dist = rep(0, nrow(modals[[m]]))
      )
      sds[[m]] <- snn.width * sd.scale
    }#for
  }else{
    #calculate based on the sigma.idx neighbor
    for (m in seq_len(n_mod)) {
      sds[[m]] <- (neighbors[[m]]$distance[, sigma.idx] - NNdists[[m]]) * sd.scale
    }#for
  }#else

  #calculate within and cross modality kernel, and modality weights
  #################################

  kernels <- lapply(seq_len(n_mod), function(m) {
    lapply(seq_len(n_mod), function(k) exp(-1 * impute.dists[[m]][[k]] / sds[[m]]))
  })

  #compute the modality weights
  scores <- lapply(seq_len(n_mod), function(m) {
    cross.mean <- Reduce("+", kernels[[m]][-m]) / (n_mod - 1)
    MinMax(kernels[[m]][[m]] / (cross.mean + cross.contant), val.min = 0, val.max = 200)
  })

  # softmax across modalities per spot
  exp.scores <- lapply(scores, exp)
  score.sum <- Reduce("+", exp.scores)
  weights <- lapply(exp.scores, function(es) es / score.sum)

  #reweigh the weight with the library size
  #################################
  # library.sizes/library.types/baselines are now lists, one entry per modality
  # modalities with NULL library.sizes[[m]] are skipped (ratio=1, no reweighting)
  message("")
  if (!is.null(library.sizes)){

    print("Adjust the weights by library size.")

    #with internel parameters, defaults to this for these modality types if baselines[[m]] is not provided
    baseline.rna <- 2300
    baseline.atac <- 7400

    if (is.null(baselines)) baselines <- vector("list", n_mod)
    if (is.null(library.types)) library.types <- as.list(rep("rna", n_mod))

    for (m in seq_len(n_mod)) {
      if (is.null(baselines[[m]])) {
        if (!is.null(library.types[[m]]) && library.types[[m]] == "atac") {
          baselines[[m]] <- baseline.atac
        } else if (!is.null(library.types[[m]]) && library.types[[m]] == "rna") {
          baselines[[m]] <- baseline.rna
        } else {
          message("No baseline for modality ", mod_names[m], " — skipping library size reweighting.")
          library.sizes[[m]] <- NULL # treat as if no library size was provided
        }
        if (!is.null(baselines[[m]])) {
          message("Reference UMI of modality ", mod_names[m], " is set to be: ", baselines[[m]])
        }
      }#if is.null
    }#for

    #with thresholding; modalities without a library.sizes entry get ratio=1 (no reweighting)
    library.ratios <- lapply(seq_len(n_mod), function(m) {
      if (!is.null(library.sizes[[m]])) {
        message("Modality ", mod_names[m], ": adjusting weights with reference UMI.")
        pmin(library.sizes[[m]] / baselines[[m]], 1)
      } else {
        message("Modality ", mod_names[m], ": no library size provided — weights not adjusted.")
        rep(1, nrow(modals[[1]]))
      }
    })

    weights <- lapply(seq_len(n_mod), function(m) weights[[m]] * library.ratios[[m]])
    weight.sum <- Reduce("+", weights)
    weights <- lapply(weights, function(w) w / weight.sum)

  }else{
    print("No adjustment with reference UMI.")
  }#else

  message("Calculating the weighted KNN and SNN")
  ########################################################################################

  #calculate the weighted sum of the link weights
  n_neighbors_k <- n_neighbors

  modal.union.neighbor.index <- list()

  for (ii in seq_len(nrow(modals[[1]]))){

    all_indices <- lapply(neighbors.large, function(nb) nb$index[ii, ])
    index.union <- unique(unlist(all_indices))

    # NOTE(review): the link weight sums weights[[m]][idx], i.e. the
    # *neighbor's* modality weight rather than spot ii's — confirm intended.
    index.union.lw <- vapply(index.union, function(idx) {
      total <- 0
      for (m in seq_len(n_mod)) {
        if (idx %in% all_indices[[m]]) total <- total + weights[[m]][idx]
      }
      total
    }, numeric(1))

    #sort and take the maxmimal several
    # assumes length(index.union) >= n_neighbors_k (true when
    # n_neighbors_large >= n_neighbors); otherwise NA indices would result
    modal.union.neighbor.index[[ii]] <- index.union[order(index.union.lw, decreasing = TRUE)[1:n_neighbors_k]]

  }#for ii

  weighted.index <- do.call(rbind, modal.union.neighbor.index)
  # NOTE(review): weighted.dist currently stores the neighbor-index list, not
  # distances — confirm this is intentional before relying on it downstream.
  weighted.dist <- modal.union.neighbor.index

  #compute KNN
  #########################################
  jj <- c(t(weighted.index))
  ii <- rep(seq_len(nrow(weighted.index)), each = ncol(weighted.index))

  knn.mat <- Matrix::sparseMatrix(
    i = ii,
    j = jj,
    x = 1,
    dims = c(nrow(weighted.index), nrow(weighted.index))
  )#knn.mat

  diag(knn.mat) <- 1
  # symmetrize: union of directed KNN edges (logical OR on 0/1 entries)
  knn.mat <- knn.mat + Matrix::t(knn.mat) - knn.mat * Matrix::t(knn.mat)

  #compute SNN
  #########################################
  snn.mat <- Seurat:::ComputeSNN(nn_ranked = weighted.index, prune = prune.SNN)

  names(weights) <- mod_names
  if (!is.null(library.sizes)) names(library.sizes) <- mod_names
  result <- list(
    weights = weights,
    weighted.index = weighted.index,
    weighted.dist = weighted.dist,
    knn.mat = knn.mat,
    snn.mat = snn.mat,
    library.sizes = library.sizes,
    library.types = library.types,
    baselines = baselines,
    n_neighbors = n_neighbors,
    n_neighbors_large = n_neighbors_large
  )
  # also expose each modality's weights as weight.<name> for convenience
  for (m in seq_len(n_mod)) {
    result[[paste0("weight.", mod_names[m])]] <- weights[[m]]
  }#for
  return(result)
}#MultiModalIntegration