Introduction
technical metrics
- Ruchan’s scoring calls:
Clustering
cluster markers
- Manual inspection of marker list:
- Clustered heatmap of curated markers
Cell type enrichment across treatment groups
- Variations in frequency
  - Empirical compositions
  - sccomp modelling (beta-binomial)

theme_umap <- function() {
  theme_void() +
    theme(panel.background = element_rect(fill = "grey", color = "black", size = 2),
          legend.position = c(1,0),
          legend.justification = c(1.1,0),
          legend.direction = "horizontal",
          legend.text = element_text(angle = 45, hjust = 1, vjust = 1),
          legend.title = element_blank(),
          plot.margin = unit(c(1,1,1,1), "mm"))
  
  
}


# function to overlay values onto the UMAP embeddings
hex_plots <-
  function(data,
           dim1,
           dim2,
           color,
           facets = NULL,
           CI = 0.05,
           fun = "mean",
           bins = 200,
           divergent_scale = F) {
    message("calc CI")
    # calculate the quantilles for CI limits onthe color scale.
    if (CI) {
      lims <- quantile(pull(data, color), probs = c(CI / 2, 1 - CI / 2))
      
      # if user supplied CI leads to 0 values ef no range, set automatically to 1%
      if (lims[2] == 0) {
        CI <- 0.01
        lims <-
          quantile(pull(data, color), probs = c(CI / 2, 1 - CI / 2))
      }
      # if still 0 values still, do not set any CI
      if (lims[2] == 0) {
        lims <- c(NA, NA)
      }
      
    } else {
      lims <- c(NA, NA)
    }
    
    message("compute plot")
    plot <- ggplot(data, aes_string(dim1, dim2)) +
      stat_summary_hex(
        bins = bins,
        fun = fun,
        aes(z = !!ensym(color), color = ..value..),
        lwd = 0.1
      ) +
      ggtitle(color) +
      theme_mrl() +
      guides(fill = guide_none()) +
      theme_umap()
    
    
    if (divergent_scale) {
      plot <- plot  +
        scale_fill_gradient2(
          midpoint = 0.5,
          low = "royalblue",
          mid = "grey80",
          high = "red4",
          limits = lims,
          oob = squish
        ) +
        scale_color_gradient2(
          midpoint = 0.5,
          low = "royalblue",
          mid = "grey80",
          high = "red4",
          limits = lims,
          oob = squish
        )
    } else {
      plot <- plot +
        scale_fill_viridis(limits = lims, oob = squish) +
        scale_color_viridis(limits = lims, oob = squish)
    }
    
    if (is.null(facets)) {
      return(plot)
    } else {
      return(plot + facet_wrap(as.formula(facets)))
    }
  }

ridges_plots <- function(data,dim1,dim2) {
  
  ggplot(data, 
       aes_string(dim1, 
                  dim2)) +
  stat_density_ridges(aes(fill = factor(stat(quantile))),
    geom = "density_ridges_gradient", calc_ecdf = TRUE,
    quantiles = c(0.01,0.05, 0.1,0.9,0.95,0.99), quantile_lines = TRUE,
    color = NA) +
  facet_grid(curated_types~., scales = "free_y", space = "free_y") +
  scale_fill_manual(labels = c("0.01","0.05", "0.1","0.1-0.9","0.9","0.95","0.99"),
                    values = c("black","grey80","black","grey80","black","grey80","black"))
}






SingleR_heatmap <- function(singleR_results, meta, k, keepBest = T) {
  
  # Input
  # singleR results: a list with 1/ scores matrix (rows = cluster/cells, col = refs), 2/ calls table on each cells (cells as rows)
  # metadata table with cell barcodes (or cluster calls) as rownames
  # k parameter for cutree, in order to split the dendrogram of cells/clusters
  
  library(ComplexHeatmap)
  
  # add barcodes back to the scores matrix
  rownames(singleR_results[[1]]) <- rownames(singleR_results[[2]])
  
  #print(rownames(singleR_results[[1]]))
  #print(rownames(meta))
  #enforce ordering of cells in the same in scores matrix and metadata
  singleR_results <- singleR_results[[1]][rownames(meta),]
  
  # scale scores per cell to % of max score
  singleR_results <- singleR_results / matrixStats::rowMaxs(singleR_results)
  
  # Only keep refs that have best score for at least one cells
  if(keepBest) {
    singleR_results <- singleR_results[,matrixStats::colMaxs(singleR_results) == 1]
  }
  

  # Print final dim before plotting
  cat("Dimension of matrix to plot after filering",dim(singleR_results))



  # hierrachical clustering
  row_clust <- hclust(dist(t(singleR_results)))

  col_clust <- hclust(dist(singleR_results))

  # cut tree and assign color for heatmap annotation
  cluster_split <- paste0("cluster ", cutree(col_clust, k = k))

  cluster_col <- ggsci::pal_futurama()(k)

  names(cluster_col) <- unique(cluster_split)

  #plot heatmap
  Heatmap(t(singleR_results),
        col = c(viridis(1000),"#ff0000"),
        show_column_names = F,
        bottom_annotation = HeatmapAnnotation(meta),
        top_annotation = HeatmapAnnotation("cluster" = cluster_split, 
                                           col = list("cluster" =cluster_col)),
        cluster_rows = row_clust,
        cluster_columns = col_clust,
        use_raster = T,
        raster_quality = 1)
  
  
  
  
  
  
}

### plotting function draft for singleR objects
plot_singleR_heatmap <- function(singleR_results, scale = T, kmeans = 10, return_scores = F) {
  
  #### usage options
  # scale: scale cells to max score fraction. Useful to highlight best score and minimize library size effect
  # kmeans: apply kmeans to cluster cells. faster than hierarchical clustering which too slow when dealing with many cells
  # TODO: add option to do hierarchical clustering on the kmeans cluster
  # TODO: add option to print max score in heatmap given matrix size in not too big
  
  if("SummarizedExperiment" %in% class(singleR_results)) {
    # if raw singleR output
    # convert to matrix and metadata
    scores = as.matrix(singleR_results$scores)
    calls = as.data.frame(singleR_results[, colnames(singleR_results) != "scores"])
  } else {
    # if singleR output converted to list of scores and meta, extract each component
    scores = singleR_results$scores
    calls = singleR_results$calls
  }
 
  # add rownames t score matrix
  rownames(scores) <- rownames(calls)
  
  # scale per cell at max value
  scores_scaled <- scores / matrixStats::rowMaxs(scores)
  
  #labels of actual best rho scre if printing score on heatmap. Not used rn
  labels <- (scores_scaled == 1) * scores %>% round(digits = 2)
  
  labels[labels == 0] <- ""
  
  
  if(scale) {
    mat <- scores_scaled
  } else {
    mat <- scores
  }
  
  if(return_scores) {
   return(mat)
 }
  
  
  hm <- ComplexHeatmap::Heatmap(
    mat,
    col = c(viridis(100), "red"),
    km = kmeans,
    cluster_rows = F,
    show_row_names = F,
    use_raster = T
  )
  hm <- draw(hm)
  
 clustering_results <- ComplexHeatmap::row_order(hm) %>%
    enframe("hm_cluster", "id") %>% 
    unnest()
 
 clustering_results <- data.frame(cell.id = rownames(mat)) %>%
   tibble::rowid_to_column("id") %>%
   left_join(clustering_results)
 
 
    
}

Introduction

Here we will visualize the results from the initial analyses: clustering, marker genes, clusters compositional variations.

# import clustering and umap data
seurat <- qread("results/NK_analysis/seurat_objects/analysisNK_Ruchan.qs") 

meta <- seurat@meta.data %>%
  tibble::rownames_to_column("cell.id") %>%
  left_join(
    seurat@reductions$umap@cell.embeddings %>% as.data.frame() %>% tibble::rownames_to_column("cell.id")
  )

DimPlot(seurat)

technical metrics

umap_settings <- paste0("UMAP_", c(1, 2))

p <- ggplot(meta, aes_string(umap_settings[1], umap_settings[2])) +
  geom_hex(bins = 200, fill = "grey")

background_tiles <- ggplot_build(p)$data[[1]] %>%
  dplyr::select(x,y) %>%
  geom_point(data = ., aes(x,y), color = "grey", size = rel(0.5))

meta %>%
  #mutate(percent.rb = log1p(percent.rb))  %>%
  hex_plots(umap_settings[1],
            umap_settings[2],
            "percent.rb",
            CI = 0.01,
            fun = "mean")

cat("\n\n")


meta %>%
  #mutate(percent.rb = log1p(percent.rb))  %>%
  hex_plots(umap_settings[1],
            umap_settings[2],
            "percent.mt",
            CI = 0.01,
            fun = "mean")

Ruchan’s scoring calls:

meta %>%
  group_by(predicted.celltype.l2) %>%
  dplyr::filter(n() > 1) %>%
  ungroup() %>%
  {
    ggplot(., aes_string(umap_settings[1], umap_settings[2])) +
      geom_hex(aes(fill = predicted.celltype.l2, alpha = log(..ndensity..)),
               bins = 200) +
      geom_density2d(color = "black", bins = 8) +
      scale_alpha(range = c(0.2, 1)) +
      theme_mrl() +
      theme(panel.background = element_rect(fill = 'grey90', color = NA)) +
      scale_fill_manual(values = rev(palette)) +
      scale_color_manual(values = rev(palette)) +
      geom_label_repel(
        data = . %>%
          group_by(predicted.celltype.l2) %>%
          summarize(
            !!umap_settings[1] := median(UQ(sym(umap_settings[1]))),!!umap_settings[2] := median(UQ(sym(umap_settings[2])))
          ),
        aes(fill = predicted.celltype.l2,
            label = predicted.celltype.l2),
        size = 6
      )
  }

Clustering

Final calls

meta <- meta %>%
  mutate(curated_cluster = seurat_clusters) %>%
  mutate(emulsion = orig.ident)

labels <- data.frame(
  "1" = "",
  "2" = "",
  "3" = "",
  "4" = "",
  "5" = "bright",
  "6" = "",
  "7" = "",
  "8" = "BM resident",
  "9" = "proliferating"
) %>% pivot_longer(everything(), names_to = "curated_cluster", values_to = "cluster_label") %>%
  mutate(curated_cluster = as.factor(parse_number(curated_cluster))) 



meta %>%
  left_join(labels) %>%
  mutate(cluster_label = fct_reorder(paste(
    curated_cluster, cluster_label
  ), as.numeric(curated_cluster))) %>%
  {
    ggplot(., aes_string(umap_settings[1], umap_settings[2])) +
      geom_hex(aes(fill = cluster_label, alpha = log(..ndensity..)),
               bins = 200) +
      geom_density2d(
        aes(group = cluster_label),
        bins = 4,
        contour_var = "ndensity",
        color = "black",
        size = 2
      ) +
      geom_density2d(aes(color = cluster_label),
                     bins = 4,
                     contour_var = "ndensity") +
      scale_alpha(range = c(0.2, 1)) +
      theme_mrl() +
      theme(panel.background = element_rect(fill = 'grey90', color = NA)) +
      scale_fill_manual(values = palette) +
      scale_color_manual(values = palette) +
      geom_label_repel(
        data = . %>%
          group_by(curated_cluster, cluster_label) %>%
          summarize(
            !!umap_settings[1] := median(UQ(sym(umap_settings[1]))),!!umap_settings[2] := median(UQ(sym(umap_settings[2])))
          ),
        aes(fill = cluster_label,
            label = curated_cluster),
        size = 6,
        point.padding = 30,
        min.segment.length = 10,
        max.overlaps = 1,
        show.legend = FALSE,
        fontface = "bold"
      ) +
      theme_void() +
      theme(panel.background = element_rect(
        fill = "grey",
        color = "black",
        size = 2
      ),
      legend.title = element_blank(),
      legend.text = element_text(size = rel(2), face = "bold"),
      #legend.spacing.y = unit(0.5, 'cm'),
      aspect.ratio = 0.7) +
      guides(alpha = guide_none(),
             fill = guide_legend(byrow = TRUE)) 
  }

T cells and NK cells from donor T23491 were emulsioned separately, providing an experimental control to our in silico partitioning of NK and T cells. It is reassuring that the NK cells selected without knowledge of emulsion information all come from the NK emulsion and no T cells carried over from donor T23491.

meta %>%
  dplyr::filter(id == "T23491") %>%
  group_by(emulsion) %>%
  mutate(emulsion = paste(emulsion, "(", n(), "cells)")) %>%
  nest() %>%
  mutate(
    plot = purrr::map2(
      data,
      emulsion,
      ~ ggplot(.x, aes_string(umap_settings[1], umap_settings[2])) +
        background_tiles +
        geom_bin_2d(bins = 100, aes(color = ..ncount.., fill = ..ncount..)) +
        theme_mrl() +
        scale_fill_viridis(
          trans = "sqrt",
          option = "C",
          n.breaks = 10
        ) +
        scale_color_viridis(
          trans = "sqrt",
          option = "C",
          n.breaks = 10
        ) +
        theme(
          legend.key.height = unit(1, "in"),
          legend.position = "none",
          axis.text.x = element_blank(),
          axis.ticks.x = element_blank(),
          axis.title.x = element_blank(),
          axis.text.y = element_blank(),
          axis.ticks.y = element_blank(),
          axis.title.y = element_blank()
        ) +
        ggtitle(.y)
    )
  ) %>%
  pull(plot) %>%
  plot_grid(plotlist = ., ncol = 2)

cluster split by sample

p0 <- ggplot(meta, aes(emulsion)) +
  geom_bar() +
  #scale_y_log10() +
  theme_mrl() +
  theme(axis.text.x = element_blank(),
        axis.title.x = element_blank())

p1 <- ggplot(meta, aes(emulsion, fill = curated_cluster, color = condition)) +
  geom_bar(position = "fill", size = 1) +
  scale_color_manual(values = c("red","black")) +
  #scale_y_log10() +
  theme_mrl() +
  scale_fill_manual(values = palette)

plot_grid(p0,p1, align = "vh", axis = "blrt", ncol = 1)

Basic QC metrics

p0 <- ggplot(meta, aes(curated_cluster)) +
  geom_bar() +
  geom_label(stat='count', aes(label=..count..), vjust=0.5) +
  #scale_y_log10() +
  theme_mrl() +
  theme(axis.text.x = element_blank(),
        axis.title.x = element_blank())

p1 <- ggplot(meta, aes(curated_cluster, fill = emulsion, color = condition, alpha = condition)) +
  geom_bar(position = "fill", size = 1) +
  scale_color_manual(values = c("red","black")) +
  scale_fill_manual(values = fct_rev(palette)) +
  #scale_y_log10() +
  theme_mrl() +
  theme(axis.text.x = element_blank(),
        axis.title.x = element_blank()) +
  scale_alpha_discrete(range = c(0.3, 0.8))

p2 <- ggplot(meta, aes(curated_cluster, nCount_RNA)) +
  geom_violin(fill = "grey") +
  scale_y_log10() +
  theme_mrl() +
  theme(axis.text.x = element_blank(),
        axis.title.x = element_blank(),
        legend.position = "none")

p3 <- ggplot(meta, aes(curated_cluster, nFeature_RNA)) +
  geom_violin(fill = "grey") +
  scale_y_log10() +
  theme_mrl() +
  theme(axis.text.x = element_blank(),
        axis.title.x = element_blank())

p4 <- ggplot(meta, aes(curated_cluster, percent.mt)) +
  geom_violin() +
  theme_mrl() +
  theme(axis.text.x = element_blank(),
        axis.title.x = element_blank(),
        legend.position = "none")

p5 <- ggplot(meta, aes(curated_cluster, percent.rb)) +
  geom_violin(fill = "grey") +
  theme_mrl() +
  theme(
        legend.position = "none")


plot_grid(p0,p1,p2,p3,p4,p5, align = "vh", axis = "blrt", ncol = 1)

cluster markers

presto_markers <- qread("results/NK_analysis/clustering_typing/clusters_markers_table_harmony_NK_paired_filtered.qs") %>%
  dplyr::filter(!grepl("^RP[LS]|^MT-|^AC[0-9]", feature)) 

write_csv(presto_markers, "results/NK_analysis/clustering_typing/clusters_markers_table_harmony_NK_paired_filtered.csv")

presto_markers <- presto_markers %>%
  group_by(feature, group) %>%
  summarize(count = n(),
            logFC = mean(logFC),
            mlogPadj = mean(-log10(padj + 10^-250))) %>%
  ungroup()

write_csv(presto_markers, "results/NK_analysis/clustering_typing/clusters_markers_table_harmony_NK_paired_filtered_summarized.csv")

I use AUROC to prioritize marker genes with a very loose pass at AUC > 0.6. Then I count how many times a gene is deemed a marker across all paired comparisons between clusters. I retained here a list of a curated list of genes with high counts, based on biological knowledge.

Most plots showing gene expression overlayed on UMAPs are saved to file at figs/markers for the sake of brevity. I highlight some questions for cluster 1 below.

Manual inspection of marker list:

match_genes <- "CST7|^GNLY|^GZMB|^GZMM|^HOPX|^CD247|^CD7|^GZMA|^GZMH|^HLA|^IGFBP7|^ITGB2|^KLRD1|^KLRF1|^PRF1|^TYROBP|^CD99|^FCER1G|^IFITM2|^IL2RB|^KLRB1|^TRDC|^CCL4|^CCL5|^CD63|^CLIC1|^IL2RG|^LGALS1|^CD300A|^CD160|^CD53|^CD81|^LY6E|^TRBC1|^ZBTB16|^ADAM|^AHR|^AIF1|^BANK1|^BIRC5|^CCR6|^CD14|^CD302|^CD33|^CD36|^CD68|^CD79|^CD86|^CD93|^CLEC|^CTLA|^CTS|^EFHD2|^FGFBP2|^FGL2|^FOXO|^FOXP|^GZM|^HIF1A|^IKZF2|^IL1|^IL4|^IRF|^KLR|^LGALS|^NKG7|^S100A|^SPON2|^VAV|^VIM|^XCL|^ABHD17A|^ADAM|^AKR1C3|^ALCAM|^AREG|^ARAP2|^BACH2|^BTG1|^CAMK4|^CARD11|^CBLB|^CCL|^CXCL|^CCR|^CXCR|^CD7|^CD8|^FOXN3|^FYN|^IL32|^IL2|^IL6|^LEF1|^SELL|^MEF2|^NELL2|^NFKB|^PARP|^TGFB|^CD63|^FCER|^FCGR|^ROR|^STAT|^TRDC|^JUNB|^CD28|^COTL1|^ICOS|^LTB|^CD6|^JUN|^CCR7|^CD3|^CD44|^ITK|^TCF7|^RUNX|^CD5|^IL7R|^BCL2|^THEMIS|^CXCR4|^CD69|^ISG|^LTA|^GNG2|^IL23R|^ID2|^CD4|^TGFB|^IFN|^PDCD|^TNFRSF18|^CD27|^NCAM|^FAU|^IFI|^BCL|^OAS"



genes_to_plot <- presto_markers %>%
  dplyr::filter(count > 5) %>%
  #dplyr::filter(grepl(match_genes, feature)) %>%
  group_by(group) %>%
  arrange(-count) %>%
  dplyr::select(group, feature) %>%
  nest() %>%
  mutate(data = purrr::map(data, ~paste(.x$feature, collapse = ", "))) %>% 
  arrange(as.numeric(group)) %>%
  mutate(data = str_wrap(data, 40))

kable(genes_to_plot) %>%
  column_spec(2, "7in")

genes_to_plot <- presto_markers %>%
  dplyr::filter(count > 4) %>%
  dplyr::select(feature) %>%
  distinct() %>%
  #dplyr::filter(grepl(match_genes, feature)) %>%
  pull(feature)


genes_to_plot <- seurat@assays$RNA@data[genes_to_plot,] %>% #marker_genes_to_plot
  as.matrix() %>%
  t() %>%
  as.data.frame() %>%
  dplyr::rename_with(~paste0("gene_",.)) %>%
  tibble::rownames_to_column("cell.id")

meta %>%
  dplyr::select(cell.id,umap_settings[1],umap_settings[2], curated_cluster, source, condition) %>%
  left_join(genes_to_plot) %>%
  gather("feat", "val",  -!matches("^gene_")) %>%
  group_by(feat) %>%
  nest() %>%
  mutate(plots = purrr::map2(data,feat, ~ ggsave(paste0("figs/markers/NK/",.y,"_mean.png"),
                                                 hex_plots(.x,
                                                           umap_settings[1],
                                                           umap_settings[2],
                                                           "val",
                                                           #"source~condition",
                                                           CI = 0.05,
                                                           fun = "mean") + 
                                                   ggtitle(.y) +
            theme(axis.text.x = element_blank(),
                  axis.title.x = element_blank(),
                  axis.text.y = element_blank(),
                  axis.title.y = element_blank(),
                  axis.ticks.x = element_blank(),
                  axis.ticks.y = element_blank(),
                  axis.line.x = element_blank(),
                  axis.line.y = element_blank(),
                  panel.grid = element_blank(),
                  panel.background = element_rect(fill = 'grey', color = NA),
                  panel.border = element_rect(colour = "black", fill=NA),
                  #strip.text = element_blank(),
                  aspect.ratio = 0.7,
                  legend.position = c(1,0),
                  legend.justification = c(1,0),
                  legend.title = element_blank(),
                  legend.direction = "horizontal",
                  legend.key.height = unit(0.1, "in"),
                  legend.text = element_text(angle = 45, hjust = 1, vjust = 1.2))) 
          )
         )

Clustered heatmap of curated markers

Main heatmap

hm_genes <- meta %>%
  dplyr::select(cell.id, curated_cluster) %>%
  left_join(genes_to_plot) %>%
  gather("feat", "val",  -!matches("^gene_")) %>%
  group_by(feat, curated_cluster) %>%
  summarize(avg = mean(val), sd = sd(val))

hm_genes <- hm_genes %>%
  dplyr::select(-sd) %>%
  mutate(feat = gsub("gene_", "", feat)) %>%
  pivot_wider(names_from = "feat",values_from = "avg") %>%
  tibble::column_to_rownames("curated_cluster") %>%
  as.matrix() %>%
  scale()



hm <- ComplexHeatmap::Heatmap(t(hm_genes),
                        col = colorRamp2(quantile(hm_genes, c(0.025,0.5,0.975)),
                                         c("royalblue", "grey", "red4")),
                        row_names_gp = gpar(fontsize = 12, fontface = "bold"),
                        column_names_gp = gpar(fontsize = 16, fontface = "bold"),
                        column_names_rot = 45,
                        column_names_centered = T,
                        column_split = NULL) 



hm <- draw(hm)

ori_col_order <- column_order(hm)
ori_row_order <- row_order(hm)

hm <- ComplexHeatmap::Heatmap(t(hm_genes),
                        col = colorRamp2(quantile(hm_genes, c(0.025,0.5,0.975)),
                                         c("royalblue", "grey", "red4")),
                        row_names_gp = gpar(fontsize = 12, fontface = "bold"),
                        column_names_gp = gpar(fontsize = 16, fontface = "bold"),
                        column_names_rot = 0,
                        column_names_centered = T,
                        row_split = 4) 



hm <- draw(hm)

# %v%
#   columnAnnotation(link= anno_mark(at = 1:ncol(hm), side = "bottom", labels = colnames(hm), labels_rot = 90, link_height = unit(20, "mm")))
#

Zoom on parts

i=1
for(cluster in row_order(hm)) {
  
  cat("\n\n#### ", i, "\n\n")
  
  hm_tmp <- ComplexHeatmap::Heatmap(
    t(hm_genes)[ori_row_order[ori_row_order %in% cluster], ori_col_order],
    col = colorRamp2(quantile(hm_genes, c(0.025, 0.5, 0.975)),
                     c("royalblue", "grey", "red4")),
    row_names_gp = gpar(fontsize = 12, fontface = "bold"),
    column_names_gp = gpar(fontsize = 16, fontface = "bold"),
    column_names_rot = 45,
    column_names_centered = T,
    cluster_rows = F,
    cluster_columns = F
  )
  hm_tmp <- draw(hm_tmp)
  
  
  cat("\n\n")
  
  i <- i+1
}

1

2

3

4

Cell type enrichment across treatment groups

Variations in frequency

Empirical compositions

frequencies <- meta %>%
  #dplyr::filter(percent.BCR < 1) %>%
  group_by(source, condition, id, curated_cluster, .drop = F) %>%
  dplyr::summarize(count = n()) %>%
  ungroup() %>%
  group_by(source, condition, id) %>%
  mutate(freq = count / sum(count)) %>%
  left_join(labels) %>%
  mutate(cluster_label = fct_reorder(paste(
    curated_cluster, "\n", cluster_label
  ), as.numeric(curated_cluster))) 

plots <- frequencies %>%
  group_by(curated_cluster, cluster_label) %>%
  nest() %>%
  mutate(
    plot = purrr::map2(
      data,
      paste("cluster" ,curated_cluster),
      ~ ggplot(.x,
        aes(condition, freq, fill = condition)
      ) +
        geom_boxplot(width = 0.5, outlier.shape = NA, position = position_nudge(-0.3)) +
        geom_point(aes(size = count), position = position_nudge(0.2), alpha = 0.5) +
        #stat_summary() +
        #geom_jitter(fill = "grey", shape = 21, color = "black") +
        facet_wrap(.~source, nrow = 1, strip.position = "bottom") +
        theme_mrl() +
        #scale_y_log10() +
        scale_alpha_continuous(range = c(0.5, 1)) +
        scale_size_continuous(trans = "log10") +
        ggsignif::geom_signif(comparisons = list(c("HD", "MM")),
                              map_signif_level = F) +
        ggtitle(.y) +
        theme(legend.position = "none",
              axis.text.x = element_blank(),
              axis.title.x = element_blank(),
              axis.ticks.x = element_blank(),
              axis.title.y = element_blank()) +
        scale_fill_jco()
      
      
    )
  ) %>%
  arrange(curated_cluster) %>%
  pull(plot) 

plots[["legend"]] <- get_legend(plots[[1]] + theme(legend.position = "left",
                                                   legend.text = element_text(size = rel(2)),
                                                   legend.title = element_text(size = rel(2))))

plots[[1]] <- plots[[1]] + theme(axis.title.y = element_text(hjust = 0, angle = 90, size = rel(2))) +
  ylab("Frequencies")

plot_grid(plotlist = plots, nrow = 3, axis = "lbrt", align = "hv")

sccomp modelling (beta-binomial)

Blood

library(sccomp)



sccomp_Blood <- meta %>%
  dplyr::filter(source == "PBMC") %>%
  sccomp_glm(
           formula_composition = ~condition,
           .sample = id,
           .cell_group = curated_cluster,
           bimodal_mean_variability_association = TRUE)

plots <- plot_summary(sccomp_Blood) 

plot_grid(plots[[1]][[1]], plot_grid(plotlist = plots[2:3], nrow = 2), nrow = 1)

Bone marrow

sccomp_BM <- meta %>%
  dplyr::filter(source == "BMMC") %>%
  sccomp_glm(
           formula_composition = ~condition,
           .sample = id,
           .cell_group = curated_cluster,
           bimodal_mean_variability_association = TRUE)


plots <- plot_summary(sccomp_BM)

plot_grid(plots[[1]][[1]], plot_grid(plotlist = plots[2:3], nrow = 2), nrow = 1)

sccomp_joined <- meta %>%
  mutate(sample = paste0(id,source)) %>%
  group_by(condition) %>%
  mutate(donor = paste0("D",unclass(as.factor(id)))) %>%
  ungroup() %>%
  sccomp_glm(
           formula_composition = ~ condition + donor:source + condition:source,
           .sample = sample,
           .cell_group = curated_cluster,
           bimodal_mean_variability_association = TRUE,
           cores = 5)

full analysis

initial exploratory analysis

PPA

2025-01-20