Setup

To begin with, we will load a bunch of libraries that we will use for the analysis.

Note the use of reticulate as we will be using functions from the sklearn python library

library(readr)
library(magrittr)
library(ggplot2)
library(parallel)

library(ElPiGraph.R)

library(reshape)
library(Rtsne)
library(igraph)
## 
## Attaching package: 'igraph'
## The following objects are masked from 'package:stats':
## 
##     decompose, spectrum
## The following object is masked from 'package:base':
## 
##     union
library(DDRTree)
## Loading required package: irlba
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following object is masked from 'package:reshape':
## 
##     expand
library(irlba)

library(reticulate)
sk <- import("sklearn.manifold")

Now we will load the data available from http://blood.stemcells.cam.ac.uk/single_cell_atlas.html. Data are already filtered and log-transformed, so we will not include any preprocessing

Header <- read_delim(file = "Datasets/Nestorowa/nestorowa_corrected_log2_transformed_counts.txt.gz", col_names = FALSE, delim = " ", n_max = 1)
## Parsed with column specification:
## cols(
##   .default = col_character()
## )
## See spec(...) for full column specifications.
ExpMat <- read_delim(file = "Datasets/Nestorowa/nestorowa_corrected_log2_transformed_counts.txt.gz", col_names = FALSE, delim = " ", skip = 1)
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   X1 = col_character()
## )
## See spec(...) for full column specifications.
Genes <- unlist(Header)
Cells <- unlist(ExpMat$X1)

ExpMat <- data.matrix(ExpMat[,-1])
colnames(ExpMat) <- Genes
rownames(ExpMat) <- Cells

then we load the additional inforlation and perform a bit of data cleaning

ExtraInfo_Head <- read_delim("Datasets/Nestorowa/all_cell_types.txt", col_names = FALSE, delim = "\t", n_max = 1)
## Parsed with column specification:
## cols(
##   .default = col_character()
## )
## See spec(...) for full column specifications.
ExtraInfo <- read_delim("Datasets/Nestorowa/all_cell_types.txt", col_names = FALSE, delim = "\t", skip = 1)
## Parsed with column specification:
## cols(
##   .default = col_integer(),
##   X1 = col_character()
## )
## See spec(...) for full column specifications.
ExtraInfo_Bin <- data.matrix(ExtraInfo[,-1])
rownames(ExtraInfo_Bin) <- unlist(ExtraInfo$X1)
colnames(ExtraInfo_Bin) <- unlist(ExtraInfo_Head)

PopID_broad <- apply(ExtraInfo_Bin[,grep("broad", colnames(ExtraInfo_Bin))], 1, function(x){
  if(sum(x==1)>1 | sum(x==1)==0){
    return(NA)
  } else {
    return(which(x==1))
  }
})

PopID_broad <- colnames(ExtraInfo_Bin)[PopID_broad]
names(PopID_broad) <- unlist(ExtraInfo$X1)
names(PopID_broad) <- gsub(pattern = "-", replacement = ".", x = names(PopID_broad))

PopID_broad_Fil <- PopID_broad[rownames(ExpMat)]

table(PopID_broad_Fil, useNA = "if")
## PopID_broad_Fil
##   CMP_broad   GMP_broad  LMPP_broad LTHSC_broad   MEP_broad        <NA> 
##         317         120         246         267         354         341

Look at data

As a first step we check PCA

PCA <- prcomp_irlba(ExpMat, n = 2, retx = TRUE)

PlotPCA <- data.frame(PCA$x, type = PopID_broad_Fil)

ggplot(PlotPCA, aes(x = PC1, y = PC2, col = type)) + geom_point(size = 1, alpha=.7) + labs(title = "PCA of Raw data")

Fitting a tree

Now, we’re ready to compute the tree that we will be using to explore the structure of the data. We will be using a trimming radius, so we need a genreal idea of the distance distribution in the data.

RAW_PCA <- irlba::prcomp_irlba(ExpMat, 50)

boxplot(dist(RAW_PCA$x))

We can now fit a tree and optimize it.

Tree <- computeElasticPrincipalTree(X = RAW_PCA$x, NumNodes = 10, Do_PCA = FALSE,
                                    TrimmingRadius = 100,
                                    ICOver = "Density", DensityRadius = 100,
                                    FinalEnergy = "Penalized", alpha = .01,
                                    ParallelRep = TRUE,
                                    drawAccuracyComplexity = FALSE,
                                    drawPCAView = TRUE,
                                    drawEnergy = FALSE)
## [1] "Generating the initial configuration"
## [1] "Creating a line in the densest part of the data. DensityRadius needs to be specified!"
## [1] "Constructing tree 1 of 1 / Subset 1 of 1"
## [1] "The elastic matrix is being used. Edge configuration will be ignored"
## [1] "Computing EPG with 10 nodes on 1645 points and 50 dimensions"
## [1] "Using a single core"
## Nodes = 2 3 4 5 6 7 8 9 
## BARCODE  ENERGY  NNODES  NEDGES  NRIBS   NSTARS  NRAYS   NRAYS2  MSE MSEP    FVE FVEP    UE  UR  URN URN2    URSD
## 3||10    3111    10  9   2   3   0   0   2961    Inf 0.5543  -Inf    113.9   35.86   358.6   3586    0
## 1.452 sec elapsed
## [[1]]

Tree_Ext <- fineTuneBR(X = RAW_PCA$x, NumNodes = 40, Do_PCA = FALSE, 
             TrimmingRadius = 100,
             FinalEnergy = "Penalized", alpha = .01,
             InitNodePositions = Tree[[1]]$NodePositions,
             InitEdges = Tree[[1]]$Edges$Edges,
             drawAccuracyComplexity = FALSE, drawPCAView = TRUE,
             drawEnergy = FALSE)
## [1] "Constructing tree 1 of 1 / Subset 1 of 1"
## [1] "Computing EPG with 40 nodes on 1645 points and 50 dimensions"
## [1] "Using a single core"
## Nodes = 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 
## BARCODE  ENERGY  NNODES  NEDGES  NRIBS   NSTARS  NRAYS   NRAYS2  MSE MSEP    FVE FVEP    UE  UR  URN URN2    URSD
## 3||40    2775    40  39  32  3   0   0   2645    Inf 0.6018  -Inf    53.79   76.24   3049    122000  0
## 5.788 sec elapsed
## [[1]]

We will also compute a set of optimized bootstrapped trees to explore the robustness of the derived structure.

Tree_Boot <- computeElasticPrincipalTree(X = RAW_PCA$x, NumNodes = 10, Do_PCA = FALSE,
                                    TrimmingRadius = 100,
                                    ICOver = "Density", DensityRadius = 100,
                                    FinalEnergy = "Penalized", alpha = .01,
                                    ProbPoint = .9, nReps = 100,
                                    n.cores = 4, ClusType = "FORK",
                                    ParallelRep = TRUE,
                                    drawAccuracyComplexity = FALSE,
                                    drawPCAView = FALSE,
                                    drawEnergy = FALSE)
## [1] "Creating a sock cluster with 4 nodes"
## [1] "Using parallel sampling analysis. Limited output available"
## [1] "Generating the initial configurations"
## [1] "Exporting initial configuration parameters"
## [1] "Exporting initial configuration parameters"
## [1] "Graphical output will be suppressed"
## [1] "Exporting parameters"
## [1] "Analysis is running ... no output will be shown"
## [1] "Sit back and relax, this may take a long time ..."
## 44.285 sec elapsed
## [1] "Constructing average tree"
## [1] "Creating a line in the densest part of the data. DensityRadius needs to be specified!"
## [1] "Using a user supplied cluster. Updating the value of X"
## [1] "Computing EPG with 10 nodes on 1000 points and 50 dimensions"
## [1] "Using a user supplied cluster. It must contains the data points in a matrix X"
## [1] "Exporting the additional variables to the cluster"
## Nodes = 2 3 4 5 6 7 8 9 
## BARCODE  ENERGY  NNODES  NEDGES  NRIBS   NSTARS  NRAYS   NRAYS2  MSE MSEP    FVE FVEP    UE  UR  URN URN2    URSD
## 1|0||10  284.8   10  9   5   0   0   0   201.7   176.7   0.9353  0.9433  67.08   15.97   159.7   1597    0
## 0.864 sec elapsed
cl <- makeForkCluster(4)

Tree_Ext_Boot <- parLapply(cl, Tree_Boot, function(x){
  fineTuneBR(X = RAW_PCA$x, NumNodes = 40, Do_PCA = FALSE, 
             TrimmingRadius = 100,
             FinalEnergy = "Penalized", alpha = .01,
             InitNodePositions = x$NodePositions,
             InitEdges = x$Edges$Edges,
             drawAccuracyComplexity = FALSE, drawPCAView = FALSE,
             drawEnergy = FALSE)[[1]]
})

stopCluster(cl)

By plotting the data we can make sure that our anlysis is reasonable.

PlotPG(RAW_PCA$x, TargetPG = Tree_Ext[[1]],
       Tree_Ext_Boot[1:100],
       GroupsLab = PopID_broad_Fil, DimToPlot = 1:3, LabMult = 4)
## [[1]]

## 
## [[2]]

## 
## [[3]]

The tree is a bit too “brancy”, we could fix this by changing the paramteer, but it’s actually simpler to filter short branches in the tree and refit the result.

Br_Fil <- CollapseBrances(RAW_PCA$x, TargetPG = Tree_Ext[[1]], Mode = "PointNumber", ControlPar = 50)
## [1] "Removing the bridge branch with nodes: 2 7"
Tree_Ext_Fil <- fineTuneBR(X = RAW_PCA$x, NumNodes = 50, Do_PCA = FALSE,
                                    TrimmingRadius = 100,
                                    ICOver = "Density", DensityRadius = 100,
                                    FinalEnergy = "Penalized", alpha = .01,
                                    InitNodePositions = Br_Fil$Nodes,
                                    InitEdges = Br_Fil$Edges,
                                    drawAccuracyComplexity = FALSE,
                                    drawPCAView = FALSE,
                                    drawEnergy = FALSE)
## [1] "Constructing tree 1 of 1 / Subset 1 of 1"
## [1] "Computing EPG with 50 nodes on 1645 points and 50 dimensions"
## [1] "Using a single core"
## Nodes = 39 40 41 42 43 44 45 46 47 48 49 
## BARCODE  ENERGY  NNODES  NEDGES  NRIBS   NSTARS  NRAYS   NRAYS2  MSE MSEP    FVE FVEP    UE  UR  URN URN2    URSD
## 1|1||50  2620    50  49  43  1   0   0   2481    Inf 0.6265  -Inf    75.59   63.09   3154    157700  0
## 3.362 sec elapsed
PlotPG(RAW_PCA$x, TargetPG = Tree_Ext_Fil[[1]],
       Tree_Ext_Boot[1:100],
       GroupsLab = PopID_broad_Fil, DimToPlot = 1:3, LabMult = 4)
## [[1]]

## 
## [[2]]

## 
## [[3]]

Extended <- ExtendLeaves(X = RAW_PCA$x, TargetPG = Tree_Ext_Fil[[1]], ControlPar = .9) 
## [[1]]

## 
## [[1]]

PlotPG(RAW_PCA$x, Extended, GroupsLab = PopID_broad_Fil, DimToPlot = 1:3,
       NodeLabels = 1:nrow(Extended$NodePositions), LabMult = 5)
## [[1]]

## 
## [[2]]

## 
## [[3]]

Pseudotime

To obtain the pseudotime we first need to get the relevant paths. We will use node 52 as root since LTHSC are predominantly associated with that branch.

Net <- ConstructGraph(PrintGraph = Extended)
Part <- PartitionData(X = RAW_PCA$x, NodePositions = Extended$NodePositions)
PrjStruct <- project_point_onto_graph(X = RAW_PCA$x,
                                      NodePositions = Extended$NodePositions,
                                      Edges = Extended$Edges$Edges,
                                      Partition = Part$Partition)

Branches <- GetSubGraph(Net, "branches")

sapply(Branches, length)
## Branch_1 Branch_2 Branch_3 Branch_4 Branch_5 Branch_6 
##       12       12        5        9       15        7
CellPerLeaf <- lapply(as.list(which(degree(Net)==1)), function(x) {
  Pop <- PopID_broad_Fil[Part$Partition == x]
  table(Pop)
})

CellPerLeaf
## $`51`
## Pop
## LTHSC_broad   MEP_broad 
##          32           1 
## 
## $`52`
## Pop
##   CMP_broad LTHSC_broad   MEP_broad 
##           1           2          20 
## 
## $`53`
## Pop
##  CMP_broad  GMP_broad LMPP_broad 
##         10         20          1 
## 
## $`54`
## Pop
##   CMP_broad   GMP_broad  LMPP_broad LTHSC_broad   MEP_broad 
##          10           8           1           2           3 
## 
## $`55`
## Pop
##   CMP_broad  LMPP_broad LTHSC_broad 
##           3           2           9
Paths <- GetSubGraph(Net, "end2end")

Paths_Norm <- lapply(Paths, function(x){
  if(x[length(x)] == "51"){
    return(rev(x))
  }
  
  if(x[1] == "51"){
    return(x)
  }
})

Paths_Norm <- Paths_Norm[sapply(Paths_Norm, function(x){!any(is.null(x))})]

Top get a better idea of how cells are associated with the different branches, we can also check the cells associted with the various branch

Branches <- GetSubGraph(Net, "branches")

TB <- sapply(Paths_Norm, function(x){
  sapply(Branches, function(y) {
    all(as.integer(y) %in% as.integer(x))
  })
})

1*TB
##          Path_1 Path_2 Path_3 Path_4
## Branch_1      1      0      1      0
## Branch_2      1      0      0      0
## Branch_3      0      0      1      0
## Branch_4      1      1      1      1
## Branch_5      0      1      0      0
## Branch_6      0      0      0      1

we can also look at how different cells are associted to different branches

BrPT <- which(degree(Net)>2)

Branches <- lapply(Branches, function(x){
  setdiff(as.integer(names(x)), BrPT)
})

CellsOnBrs <- lapply(Branches, function(x){
  which(Part$Partition %in% x)
})

Table <- sapply(CellsOnBrs, function(x){
  table(factor(PopID_broad_Fil[x], levels = unique(PopID_broad_Fil)))
})

Table
##             Branch_1 Branch_2 Branch_3 Branch_4 Branch_5 Branch_6
## LTHSC_broad       15       18        3      182       30       18
## LMPP_broad         9        1        1       20       82      132
## GMP_broad         11        2       10        1       87        9
## MEP_broad          9      280       50        1        1        1
## CMP_broad         86       10       37        2      127       52

This information can be used to give a name the differnet branches

names(Paths_Norm) <- c("LTHSC->CMP->MEP (1)", "LTHSC->CMP", "LTHSC->CMP->MEP (2)", "LTHSC->LMPP")

We are now able to get notable genes

SmoothGenes <- CompareOnBranches(X = ExpMat, Paths = Paths_Norm, TargetPG = Extended, GroupsLab = PopID_broad_Fil, n.cores = 4, ClusType = "FORK",
                  Partition = Part$Partition, PrjStr = PrjStruct, Span = .2, Mode = "KW", ReturnGenes = TRUE)
## [1] "Feature selection by Kruskal-Wallis rank sum test on differential branches"
boxplot(SmoothGenes, ylab = "Kruskal-Wallis pv", log='y')

and plot them

CompareOnBranches(X = ExpMat, Paths = Paths_Norm, TargetPG = Extended, GroupsLab = PopID_broad_Fil,
                  Partition = Part$Partition, PrjStr = PrjStruct, Span = .3, alpha = .05,
                  Features = names(SmoothGenes)[order(SmoothGenes, decreasing = FALSE)[1:27]], ScalePT = TRUE)
## [1] "3 pages will be plotted"
## [[1]]

## 
## [[2]]

## 
## [[3]]

CompareOnBranches(X = ExpMat, Paths = Paths_Norm, TargetPG = Extended, GroupsLab = PopID_broad_Fil,
                  n.cores = 4, ClusType = "FORK", alpha = .05,
                  Partition = Part$Partition, PrjStr = PrjStruct, Span = .2,
                  Mode = "MI", ModePar = "min", Features = 27,
                  ReturnGenes = FALSE, ScalePT = TRUE)
## [1] "Feature selection by mutual information"
## [1] "3 pages will be plotted"
## [[1]]

## 
## [[2]]

## 
## [[3]]

CompareOnBranches(X = ExpMat, Paths = Paths_Norm, TargetPG = Extended, GroupsLab = PopID_broad_Fil,
                  n.cores = 4, ClusType = "FORK", alpha = .05,
                  Partition = Part$Partition, PrjStr = PrjStruct, Span = .2,
                  Mode = "Var", ModePar = "min", Features = 27,
                  ReturnGenes = FALSE, ScalePT = TRUE)
## [1] "Feature selection by variance"
## [1] "3 pages will be plotted"
## [[1]]

## 
## [[2]]

## 
## [[3]]

Finally, we can also explore if we can find differences between the two paths with the same population structure

CompareOnBranches(X = ExpMat, Paths = Paths_Norm[c(1,3)], TargetPG = Extended, GroupsLab = PopID_broad_Fil,
                  n.cores = 4, ClusType = "FORK", alpha = .05, TrajCol = TRUE,
                  Partition = Part$Partition, PrjStr = PrjStruct, Span = .2,
                  Mode = "KW", Features = 27,
                  ReturnGenes = FALSE, ScalePT = TRUE)
## [1] "Feature selection by Kruskal-Wallis rank sum test on differential branches"
## [1] "3 pages will be plotted"
## [[1]]

## 
## [[2]]

## 
## [[3]]