Setup

To begin with, we will load a bunch of libraries that we will use for the analysis. Note the use of reticulate as we will be ussing functions from the sklearn library

library(readr)
library(magrittr)
library(ggplot2)
library(ElPiGraph.R)

library(reshape)
library(Rtsne)
library(igraph)
## 
## Attaching package: 'igraph'
## The following objects are masked from 'package:stats':
## 
##     decompose, spectrum
## The following object is masked from 'package:base':
## 
##     union
library(rgl)
library(DDRTree)
## Loading required package: irlba
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following object is masked from 'package:reshape':
## 
##     expand
library(irlba)

library(reticulate)
sk <- import("sklearn.manifold")

Now we will load the data available from https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE60781. For this example we will use both the raw count and the RPKM data.

Let us begin by loading the raw count and formatting them.

BaseDir <- "~/Git/Roscoff/Datasets/Schlitzer/"

AllCells_RAW <- dir(path = paste0(BaseDir, "RAW"), full.names = TRUE)
AllCells_RAW <- AllCells_RAW[grep("readcount", AllCells_RAW, ignore.case = TRUE)]

CellRC <- list()

for(i in 1:length(AllCells_RAW)){
  cat(i, " ")
  CellRC[[i]] <- read.delim(file = AllCells_RAW[i], header = FALSE)
  if(i != 1){
    if(any(CellRC[[1]]$V1 != CellRC[[i]]$V1)){
      print(paste("incpmpatible gene names for cell", i))
    }
  }
}
## 1  2  3  4  5  6  7  8  9  10  11  12  13  14  15  16  17  18  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53  54  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71  72  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89  90  91  92  93  94  95  96  97  98  99  100  101  102  103  104  105  106  107  108  109  110  111  112  113  114  115  116  117  118  119  120  121  122  123  124  125  126  127  128  129  130  131  132  133  134  135  136  137  138  139  140  141  142  143  144  145  146  147  148  149  150  151  152  153  154  155  156  157  158  159  160  161  162  163  164  165  166  167  168  169  170  171  172  173  174  175  176  177  178  179  180  181  182  183  184  185  186  187  188  189  190  191  192  193  194  195  196  197  198  199  200  201  202  203  204  205  206  207  208  209  210  211  212  213  214  215  216  217  218  219  220  221  222  223  224  225  226  227  228  229  230  231  232  233  234  235  236  237  238  239  240  241  242  243  244  245  246  247  248  249  250  251

each element of the list is a 2D tibble. We begin by checking that all the files have the same genes.

all(
  sapply(CellRC[-1], function(x){
    all(x$V1 == CellRC[[1]]$V1)
  })
)
## [1] TRUE

Since this is indeed the case, we can combine the gene expression into a single gene count matrix and removed the original file. We also extract and save extra infor contained in the data.

ExpMat_RAW <- lapply(CellRC, function(x){
  x$V2
})

ExpMat_RAW <- do.call(rbind, ExpMat_RAW)
colnames(ExpMat_RAW) <- CellRC[[1]]$V1
rownames(ExpMat_RAW) <- sapply(strsplit(AllCells_RAW, "_"), "[[", 2)

rm(CellRC)
gc()
##           used (Mb) gc trigger  (Mb) max used  (Mb)
## Ncells 1391872 74.4    2164898 115.7  1770749  94.6
## Vcells 5293970 40.4   24949369 190.4 25920948 197.8
InfoCol <- (ncol(ExpMat_RAW)-4):ncol(ExpMat_RAW)

ExtraInfo <- ExpMat_RAW[,InfoCol]
ExpMat_RAW <- ExpMat_RAW[, -InfoCol]

Now, we load the RPKM data

RPK_Expression <- read_delim(file = "~/Git/Roscoff/Datasets/Schlitzer/GSE60781_RPKM.txt.gz", col_names = TRUE, delim = "\t")
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   geneName = col_character()
## )
## See spec(...) for full column specifications.
Genes <- unlist(RPK_Expression$geneName)
RPK_Expression <- data.matrix(RPK_Expression[,-1])
rownames(RPK_Expression) <- Genes

RPK_Expression <- t(RPK_Expression)

Just to make sure, we check that we have the same information

all(rownames(RPK_Expression) %in% rownames(ExpMat_RAW))
## [1] TRUE
all(rownames(ExpMat_RAW) %in% rownames(RPK_Expression))
## [1] TRUE
all(colnames(RPK_Expression) %in% colnames(ExpMat_RAW))
## [1] TRUE
all(colnames(ExpMat_RAW) %in% colnames(RPK_Expression))
## [1] FALSE
MissingGenes <- which(!(colnames(ExpMat_RAW) %in% colnames(RPK_Expression)))
head(ExpMat_RAW[, MissingGenes])
##        Aasdh Ablim2 Adam22 Add1 Bcl7b Cacnb2 Cd46 Clcn3 Cmah Dapk2 Ddo
## RMD072     0      0      0    0   107      0    0     0    0     0   0
## RMD073     0      0      0  460     1      0    0     0    0     0   0
## RMD074     0      0      0    0     0      0    0     8    0     0   0
## RMD075     0      0      0    0     0      0    0     0    0     0   0
## RMD076    10      0      0    3     2      0    0     0    0     0   0
## RMD077     0      0      0    0     0      0    0     0    0     0   0
##        E2f3 Elf2 Fuk Gpx4 Grk6 H2-gs10 H2-t9 Hdac7 Htra2 Il15ra Kcnmb2
## RMD072   17    0   0   33    0       0     0     0     0      0      0
## RMD073    0    0   0    5    1       0     0     0     0      0      0
## RMD074    0    0   0    5    0       0     0     0     0      0      0
## RMD075   40    0   0    0  160       0     0     0     0      0      0
## RMD076    0    0   0   38  107       0     0     0     0      0      0
## RMD077    0    0   0    3    0       0     0     0     0      0      0
##        Kif1b Lpcat4 Man2b2 Mat2b Mrpl13 Mrpl33 Mrpl4 Mrps11 Mrps17 Naaa
## RMD072   192      0     14    37      0     18   119      0      3    0
## RMD073   279      0    217   139      0    116     4      0    119  250
## RMD074     0      0      5     1      0    154   289      0      1    0
## RMD075     0      0      0     0      0    230     1      0     42    0
## RMD076     0      0      0     0      0     56    97      0     84    0
## RMD077     0      0     18     5      0     75    83      0     56    0
##        Nox1 Nrcam Nrk Oscp1 Pink1 Pkd1l3 Plrg1 Rab22a Rab2a Rab2b Rab40c
## RMD072    0     0   0     0     0      0     0      0     0     0      0
## RMD073    0     0   0     0     2      0     0      0    65     0      0
## RMD074    0     0   0     0     0      0     0      0   517     0      0
## RMD075    0     0   0     0     0      0     0      0     6     0      0
## RMD076    0     0   0     0     0      0     0      0     0     0      0
## RMD077    0     0   0     0     0      0     0      0     5     0      0
##        Rab4b Rab8b Rgs7 Sash1 Sh3d19 Sik2 Smc4 Sp1 Sprr2h Srpk1 Tcra Tcrb
## RMD072     5     0    0     0      0    0  567  20      0     2    0    0
## RMD073    22     0    0     0      0    0  647   0      0     0    0    0
## RMD074    33     0    0     0      0    0   22  94      0     0    0    0
## RMD075     0     1    0     0      0    0 1046 396      0     0    0    0
## RMD076     0     0    0    85      0    0    3   1      0   143    0    0
## RMD077     0     5    0     0      0    0 1128 919      0    22    0    0
##        Tmeff2 Txnrd2 Ush2a Vav2 Yaf2 a2ld1 abca5 abca8a aprt art3 cacna1d
## RMD072      0      0     0    0    0     0     1      0    0    0       0
## RMD073      0      0     0    0    0     0     0      0    0    0       0
## RMD074      0      0     0    0    0     0     0      0    0    0       0
## RMD075      0      0     0    0    0     0     0      0    0    0       0
## RMD076      0      0     0    0    0     0     0      0    0    0       0
## RMD077      0      0     0    0    0     0     0      0    0    0       0
##        cdc6 cml3 helB limd1 msi2 nek2 notch1 oasl1 piwiL4 rage scmh1
## RMD072    1    0    0     0    0    0      0     0      0    0     0
## RMD073    0    0    0     0    0    0      0     0      0    0     0
## RMD074    0    0    0     0    0    0      0     0      0    0     0
## RMD075    0    0    0     0    0    0      0     0      0    0     0
## RMD076    0    0    0     0    0    0      0     0      0    0     0
## RMD077    0    0    0     0    0    0      0     0      0    0     0
##        slc18a2 slc26a6 slc43a2 smoc2 tubg1 wdr4 wiz
## RMD072       0       0       0     0     0    0   0
## RMD073       0       0       0     0     0    0   0
## RMD074       0       0       0     0     0    0   0
## RMD075       0       0       0     0     0    0   0
## RMD076       0       0       0     0     0    0   0
## RMD077       0       0       0     0     0    0   0

Apparently not … lets continue anyway. The population asssigned to the cells can be obtained by the article. They are sequential, but there are some missing info.

CellID <- as.integer(sapply(strsplit(rownames(ExpMat_RAW), "RMD"), "[[", 2))
names(CellID)[CellID <= 167] <- "BM_CD115+_CDP"
names(CellID)[CellID >= 168 & CellID <= 263] <- "BM_PreDC"
names(CellID)[CellID >= 264] <- "BM_MDP"

names(CellID)[CellID == 78] <- "N/A"
names(CellID)[CellID == 89] <- "N/A"
names(CellID)[CellID == 100] <- "N/A"
names(CellID)[CellID == 121] <- "N/A"
names(CellID)[CellID == 137] <- "N/A"
names(CellID)[CellID == 158] <- "N/A"
names(CellID)[CellID == 160] <- "N/A"
names(CellID)[CellID == 166] <- "N/A"
names(CellID)[CellID == 186] <- "N/A"
names(CellID)[CellID == 203] <- "N/A"
names(CellID)[CellID == 208] <- "N/A"
names(CellID)[CellID == 220] <- "N/A"

Feature selection

To simplify the analysis and to reduce noise, we will only select overdispersed genes. There are vaious approch to do this. Overall, we will apply the same as STREAM. To begin with we will filter genes expressed in less that 1% of the cells.

Thr <- round(.01*nrow(ExpMat_RAW))

table(colSums(ExpMat_RAW > 0) > Thr)
## 
## FALSE  TRUE 
## 17245 12618
ToKeep <- colnames(ExpMat_RAW)[colSums(ExpMat_RAW > 0) > Thr]

ExpMat_RAW_Fil <- log2(ExpMat_RAW[, ToKeep] + 8)
RPK_Expression_Fil <- log2(RPK_Expression[, colnames(RPK_Expression) %in% ToKeep] + 8)

and then select only overdispersed genes

X <- apply(ExpMat_RAW_Fil, 2, mean)
Y <- apply(ExpMat_RAW_Fil, 2, sd)
XSeq <- seq(from = 0, to = max(X), by=.1)

plot(X, Y, xlab = "mean gene expression", ylab = "sd of gene expression")

LOE <- loess(Y ~ X, span = .05)
points(XSeq, predict(LOE, newdata = data.frame(X = XSeq)), type = 'l', col = 'red')

Cols <- ifelse(predict(LOE) < Y, "red", "blue")

OEGenes <- colnames(ExpMat_RAW_Fil)[Y > predict(LOE)]

plot(X, Y, xlab = "mean gene expression", ylab = "sd of gene expression", col = adjustcolor(Cols, alpha.f = .4), pch = 20)

ExpMat_RAW_Fil_OE <- ExpMat_RAW_Fil[, OEGenes]

We will repeat the same approach for RPKM data

X <- apply(RPK_Expression_Fil, 2, mean)
Y <- apply(RPK_Expression_Fil, 2, sd)
XSeq <- seq(from = 0, to = max(X), by=.1)

plot(X, Y, xlab = "mean gene expression", ylab = "sd of gene expression")

LOE <- loess(Y ~ X, span = .05)
points(XSeq, predict(LOE, newdata = data.frame(X = XSeq)), type = 'l', col = 'red')

Cols <- ifelse(predict(LOE) < Y, "red", "blue")

OEGenes_RPK <- colnames(RPK_Expression_Fil)[Y > predict(LOE)]

plot(X, Y, xlab = "mean gene expression", ylab = "sd of gene expression", col = adjustcolor(Cols, alpha.f = .4), pch = 20)

RPK_Expression_Fil_OE <- RPK_Expression_Fil[, OEGenes_RPK]

We can also compare the selected overdispersed genes

table(OEGenes_RPK %in% OEGenes)
## 
## FALSE  TRUE 
##  1577  4709
table(OEGenes %in% OEGenes_RPK)
## 
## FALSE  TRUE 
##  2462  4709

Look at data

As a first step we check PCA

PCA <- prcomp_irlba(ExpMat_RAW_Fil_OE, n = 2, retx = TRUE)

PlotPCA <- data.frame(PCA$x, type = names(CellID))

ggplot(PlotPCA, aes(x = PC1, y = PC2, col = type)) + geom_point(size = 1, alpha=.7) + labs(title = "PCA of Raw data")

tSNE <- Rtsne(ExpMat_RAW_Fil_OE, initial_dims = 50, perplexity = 10)

PlotPCA <- data.frame(D1 = tSNE$Y[,1], D2 = tSNE$Y[,2], type = names(CellID))

ggplot(PlotPCA, aes(x = D1, y = D2, col = type)) + geom_point(size = 1, alpha=.7) + labs(title = "tSNE of Raw data")

tictoc::tic()
MLLE.Model <- sk$LocallyLinearEmbedding(n_neighbors=as.integer(ceiling(nrow(ExpMat_RAW_Fil_OE)*.2)),
                                  n_components = as.integer(3),
                                  method = 'modified',
                                  eigen_solver = 'dense',
                                  n_jobs = as.integer(1),
                                  random_state = as.integer(10),
                                  neighbors_algorithm = 'kd_tree')

MLLE.Fit <- MLLE.Model$fit(ExpMat_RAW_Fil_OE)
MLLE.Emb <- MLLE.Fit$embedding_
tictoc::toc()
## 2.765 sec elapsed
PlotPCA <- data.frame(D1 = MLLE.Emb[,1], D2 = MLLE.Emb[,2], type = names(CellID))

ggplot(PlotPCA, aes(x = D1, y = D2, col = type)) + geom_point(size = 1, alpha=.7) + labs(title = "MLLE of Raw data")

Pseudotime (MLLE)

boxplot(dist(MLLE.Emb))

Curve <- computeElasticPrincipalCurve(X = MLLE.Emb, NumNodes = 10, Do_PCA = FALSE, TrimmingRadius = .15, ProbPoint = .9, nReps = 100,
                                      n.cores = 4, ClusType = "FORK", ParallelRep = TRUE)
## [1] "Creating a sock cluster with 4 nodes"
## [1] "Using parallel sampling analysis. Limited output available"
## [1] "Generating the initial configurations"
## [1] "Exporting initial configuration parameters"
## [1] "Exporting initial configuration parameters"
## [1] "Graphical output will be suppressed"
## [1] "Exporting parameters"
## [1] "Analysis is running ... no output will be shown"
## [1] "Sit back and relax, this may take a long time ..."
## 3.582 sec elapsed
## [1] "Constructing average tree"
## [1] "Creating a chain in the 1st PC with 2 nodes"
## [1] "Using a user supplied cluster. Updating the value of X"
## [1] "Computing EPG with 10 nodes on 1000 points and 3 dimensions"
## [1] "Using a user supplied cluster. It must contains the data points in a matrix X"
## [1] "Exporting the additional variables to the cluster"
## Nodes = 2 3 4 5 6 7 8 9 
## BARCODE  ENERGY  NNODES  NEDGES  NRIBS   NSTARS  NRAYS   NRAYS2  MSE MSEP    FVE FVEP    UE  UR  URN URN2    URSD
## 0||10    0.00108 10  9   8   0   0   0   0.0006209   0.0005494   0.9234  0.9323  0.0002393   0.0002197   0.002197    0.02197 0

## 1.123 sec elapsed
## [[1]]

PlotPG(MLLE.Emb, Curve[[101]], BootPG = Curve[1:100], GroupsLab = names(CellID), DimToPlot = 1:3, NodeLabels = 1:30, LabMult = 4)
## [[1]]

## 
## [[2]]

## 
## [[3]]

Extended <- ExtendLeaves(X = MLLE.Emb, TargetPG = Curve[[101]], ControlPar = .9, TrimmingRadius = .1) 
## [[1]]

## 
## [[1]]

PlotPG(MLLE.Emb, Extended, GroupsLab = names(CellID), DimToPlot = 1:3,
       NodeLabels = 1:nrow(Extended$NodePositions), LabMult = 4)
## [[1]]

## 
## [[2]]

## 
## [[3]]

Net <- ConstructGraph(PrintGraph = Extended)
Part <- PartitionData(X = MLLE.Emb, NodePositions = Extended$NodePositions)
PrjStruct <- project_point_onto_graph(X = MLLE.Emb,
                                      NodePositions = Extended$NodePositions,
                                      Edges = Extended$Edges$Edges,
                                      Partition = Part$Partition)

Paths <- GetSubGraph(Net, "end2end")

Paths_Norm <- lapply(Paths, function(x){
  if(x[1] != "12"){
    rev(x)
  } else {
    x
  }
})
PT_MLLE <- getPseudotime(ProjStruct = PrjStruct, NodeSeq = names(Paths_Norm$Path_1))

hist(PT_MLLE$Pt)

SmoothGenes <- CompareOnBranches(X = ExpMat_RAW_Fil_OE, Paths = Paths_Norm, TargetPG = Curve[[1]],
                                 GroupsLab = names(CellID), n.cores = 4, ClusType = "FORK",
                                 Partition = Part$Partition, PrjStr = PrjStruct, Span = .2, Mode = "MI", ReturnGenes = TRUE)
## [1] "Feature selection by mutual information"
boxplot(SmoothGenes, ylab = "Mutual information")

CompareOnBranches(X = ExpMat_RAW_Fil_OE, Paths = Paths_Norm, TargetPG = Curve[[1]], GroupsLab = names(CellID),
                  Partition = Part$Partition, PrjStr = PrjStruct, Span = .3, Mode = "Var",
                  Features = names(SmoothGenes)[SmoothGenes > quantile(SmoothGenes, .98)])
## [1] "16 pages will be plotted"
## [[1]]

## 
## [[2]]

## 
## [[3]]

## 
## [[4]]

## 
## [[5]]

## 
## [[6]]

## 
## [[7]]

## 
## [[8]]

## 
## [[9]]

## 
## [[10]]

## 
## [[11]]

## 
## [[12]]

## 
## [[13]]

## 
## [[14]]

## 
## [[15]]

## 
## [[16]]

Pseudotime (PCA)

RAW_PCA <- irlba::prcomp_irlba(ExpMat_RAW_Fil_OE, 50)


boxplot(dist(RAW_PCA$x))

Curve <- computeElasticPrincipalCurve(X = RAW_PCA$x, NumNodes = 10, Do_PCA = FALSE, TrimmingRadius = 100, ProbPoint = .9,
                                      nReps = 100, ParallelRep = TRUE, n.cores = 4, ClusType = "FORK")
## [1] "Creating a sock cluster with 4 nodes"
## [1] "Using parallel sampling analysis. Limited output available"
## [1] "Generating the initial configurations"
## [1] "Exporting initial configuration parameters"
## [1] "Exporting initial configuration parameters"
## [1] "Graphical output will be suppressed"
## [1] "Exporting parameters"
## [1] "Analysis is running ... no output will be shown"
## [1] "Sit back and relax, this may take a long time ..."
## 3.903 sec elapsed
## [1] "Constructing average tree"
## [1] "Creating a chain in the 1st PC with 2 nodes"
## [1] "Using a user supplied cluster. Updating the value of X"
## [1] "Computing EPG with 10 nodes on 1000 points and 50 dimensions"
## [1] "Using a user supplied cluster. It must contains the data points in a matrix X"
## [1] "Exporting the additional variables to the cluster"
## Nodes = 2 3 4 5 6 7 8 9 
## BARCODE  ENERGY  NNODES  NEDGES  NRIBS   NSTARS  NRAYS   NRAYS2  MSE MSEP    FVE FVEP    UE  UR  URN URN2    URSD
## 0||10    122.3   10  9   8   0   0   0   89.66   77.44   0.9342  0.9432  21.22   11.43   114.3   1143    0

## 1.118 sec elapsed
## [[1]]

PlotPG(RAW_PCA$x, Curve[[101]], Curve[1:100], GroupsLab = names(CellID), DimToPlot = 1:3, NodeLabels = 1:30, LabMult = 4)
## [[1]]

## 
## [[2]]

## 
## [[3]]

Extended <- ExtendLeaves(X = RAW_PCA$x, TargetPG = Curve[[1]], ControlPar = .9) 
## [[1]]

## 
## [[1]]

PlotPG(RAW_PCA$x, Extended, GroupsLab = names(CellID), DimToPlot = 1:3,
       NodeLabels = 1:nrow(Extended$NodePositions), LabMult = 4)
## [[1]]

## 
## [[2]]

## 
## [[3]]

Net <- ConstructGraph(PrintGraph = Extended)
Part <- PartitionData(X = RAW_PCA$x, NodePositions = Extended$NodePositions)
PrjStruct <- project_point_onto_graph(X = RAW_PCA$x,
                                      NodePositions = Extended$NodePositions,
                                      Edges = Extended$Edges$Edges,
                                      Partition = Part$Partition)

Paths <- GetSubGraph(Net, "end2end")

Paths_Norm <- lapply(Paths, function(x){
  if(x[1] != "11"){
    rev(x)
  } else {
    x
  }
})
PT_PCA <- getPseudotime(ProjStruct = PrjStruct, NodeSeq = names(Paths_Norm$Path_1))
hist(PT_PCA$Pt)

plot(PT_PCA$Pt, PT_MLLE$Pt)

SmoothGenes <- CompareOnBranches(X = ExpMat_RAW_Fil_OE, Paths = Paths_Norm, TargetPG = Curve[[1]], GroupsLab = names(CellID),
                  Partition = Part$Partition, PrjStr = PrjStruct, Span = .2, Mode = "MI", ReturnGenes = TRUE, n.cores = 4, ClusType = "FORK")
## [1] "Feature selection by mutual information"
boxplot(SmoothGenes, ylab = "Mutual information")

CompareOnBranches(X = ExpMat_RAW_Fil_OE, Paths = Paths_Norm, TargetPG = Curve[[1]], GroupsLab = names(CellID),
                  Partition = Part$Partition, PrjStr = PrjStruct, Span = .3, Mode = "Var",
                  Features = names(SmoothGenes)[SmoothGenes > quantile(SmoothGenes, .98)])
## [1] "16 pages will be plotted"
## [[1]]

## 
## [[2]]

## 
## [[3]]

## 
## [[4]]

## 
## [[5]]

## 
## [[6]]

## 
## [[7]]

## 
## [[8]]

## 
## [[9]]

## 
## [[10]]

## 
## [[11]]

## 
## [[12]]

## 
## [[13]]

## 
## [[14]]

## 
## [[15]]

## 
## [[16]]

CompareOnBranches(X = ExpMat_RAW_Fil_OE, Paths = Paths_Norm, TargetPG = Curve[[1]], GroupsLab = names(CellID),
                  Partition = Part$Partition, PrjStr = PrjStruct, Span = .3, Mode = "Var", Features = 9)
## [1] "Feature selection by variance"
## [1] "1 pages will be plotted"