Chapter 7 Comparing networks

7.1 Comparing multiple networks

The inference including hundreds of genes should be interpreted with caution especially for the low bootstrap number. For assessing the network stability, the users can compare and output F-measure by the function compareBNs, which accepts the list of multiple networks (bn) and output the pairwise F-measures. In this analysis, We perform the inference five times for 100 and 500 bootstrap numbers in the specific reactome.

fvres <- c()
cl <- parallel::makeCluster(12)
for (R in c(200,500)){
  n1 <- bngeneplot(pway, vstedTCGA, pathNum=13, returnNet=T, R=R, cl=cl)
  n2 <- bngeneplot(pway, vstedTCGA, pathNum=13, returnNet=T, R=R, cl=cl)
  n3 <- bngeneplot(pway, vstedTCGA, pathNum=13, returnNet=T, R=R, cl=cl)
  n4 <- bngeneplot(pway, vstedTCGA, pathNum=13, returnNet=T, R=R, cl=cl)
  n5 <- bngeneplot(pway, vstedTCGA, pathNum=13, returnNet=T, R=R, cl=cl)
  geneNum <- length(n1$av$nodes)
  
  nets <- list(n1$av, n2$av, n3$av, n4$av, n5$av)
  igList <- list(as.igraph(n1$av), as.igraph(n2$av), as.igraph(n3$av), as.igraph(n4$av), as.igraph(n5$av))
  allnets <- nets
  fvalues <- compareBNs(nets)
  fvres <- rbind(fvres, c("TCGA", 13, geneNum, R, "default", mean(fvalues), sd(fvalues)))
  
  ## Threshold 0.8
  n1av <- averaged.network(n1$str, threshold=0.8)
  n2av <- averaged.network(n2$str, threshold=0.8)
  n3av <- averaged.network(n3$str, threshold=0.8)
  n4av <- averaged.network(n4$str, threshold=0.8)
  n5av <- averaged.network(n5$str, threshold=0.8)
  
  nets <- list(n1av, n2av, n3av, n4av, n5av)
  fvalues <- compareBNs(nets)
  fvres <- rbind(fvres, c("TCGA", 13, geneNum, R, 0.8, mean(fvalues), sd(fvalues)))
}
resDF <- data.frame(fvres)
colnames(resDF) <- c("dataset","path_number","gene_number","R","threshold","mean","sd")
pathway_name <- sapply(resDF$path_number, function(x) pway@result$Description[as.numeric(x)])
resDF$label <- paste0(resDF$gene_number, "\n(", stringr::str_wrap(pathway_name,10),")")
parallel::stopCluster(cl)
kable(resDF)

dataset	path_number	gene_number	R	threshold	mean	sd	label
TCGA	13	48	200	default	0.879001362810166	0.0268588783377082	48 (RHO GTPases Activate Formins)
TCGA	13	48	200	0.8	0.910779196545874	0.0154934340979147	48 (RHO GTPases Activate Formins)
TCGA	13	48	500	default	0.910205528150544	0.00830851399185569	48 (RHO GTPases Activate Formins)
TCGA	13	48	500	0.8	0.936679898625571	0.0195192542150308	48 (RHO GTPases Activate Formins)

Plot the result for the visual assessment.

defa <- resDF %>% filter(threshold=="default") %>%
    ggplot(aes(x=label,
               y=as.numeric(mean),
               group=dataset,
               color=dataset,
               bg.color=dataset,
               segment.color=dataset)
           )+
    geom_jitter(size=4, width=0.1) +
    theme_minimal() +

    geom_text_repel(aes(label = R),
                     color = "white", size=4,
                     bg.r = .15) + 
    scale_color_brewer(palette = "Set1",
                         name = "Dataset",
                         # The same color scall will apply to both of these aesthetics.
                         aesthetics = c("color","bg.color","segment.color"))+
    geom_hline(yintercept=0.8, lty=3, color="red", size=1) +
        xlab("Gene number") + ylab("F-measure") + ggtitle("Default strength threshold")

high <- resDF %>% filter(threshold=="0.8") %>%
    ggplot(aes(x=label,
               y=as.numeric(mean),
               group=dataset,
               color=dataset,
               bg.color=dataset,
               segment.color=dataset)
    )+
    geom_jitter(size=4, width=0.1) +
    theme_minimal() +
    
    geom_text_repel(aes(label = R),
                    color = "white", size=4,
                    bg.r = .15) + 
    scale_color_brewer(palette = "Set1",
                       name = "Dataset",
                       # The same color scall will apply to both of these aesthetics.
                       aesthetics = c("color","bg.color","segment.color"))+
    geom_hline(yintercept=0.8, lty=3, color="red", size=1) +
    xlab("Gene number") + ylab("F-measure") + ggtitle("Strength threshold = 0.8")

defa / high

Take the intersection of networks.

as.bn(Reduce(igraph::intersection, igList))

FALSE 
FALSE   Random/Generated Bayesian network
FALSE 
FALSE   model:
FALSE    [NUP107][CDC20][KNL1][KIF2C|CDC20][BUB1B|KNL1][BIRC5|CDC20:KIF2C][CENPF|KNL1:KIF2C:BUB1B]
FALSE    [CDCA8|CDC20:KIF2C][ZWILCH|KNL1:BUB1B][CENPA|BIRC5:CDCA8:KIF2C][CENPE|CENPF:KNL1:BUB1B]
FALSE    [BUB1|CENPF:KIF2C:BUB1B][AURKB|BIRC5:CDC20][CENPI|CDCA8:KNL1:CENPE][KIF18A|CENPE:AURKB]
FALSE    [CENPK|CENPE:BUB1B][NUP85|BIRC5:CENPE][SPC25|BIRC5:AURKB][MAD2L1|BIRC5:CENPE:ZWILCH]
FALSE    [SPDL1|CENPK:BUB1B][DSN1|NUP85:KIF2C][SPC24|BIRC5:KIF2C:SPC25:AURKB]
FALSE    [SGO2|KIF18A:KNL1:BUB1:AURKB][PLK1|BIRC5:CENPA:CDC20:SPC25][TUBA1C|CDC20:SPC25]
FALSE    [NUP133|CENPF:CENPE:SPC24][SRF|TUBA1C][SGO1|TUBA1C:BUB1][CENPU|DSN1:MAD2L1]
FALSE    [CENPH|BIRC5:CENPA:CENPK:SPC25:SPC24:MAD2L1:ZWILCH]
FALSE    [KNTC1|BIRC5:CENPK:NUP85:KNL1:CENPE:DSN1][NUP37|CENPF:CENPH:TUBA1C:ZWILCH:KNTC1]
FALSE    [CENPO|CENPA:NUP85:CENPU:MAD2L1][RHOB|SRF:SGO1][INCENP|CDCA8:KNL1:CENPH:SPC24:MAD2L1:BUB1]
FALSE    [SKA1|BIRC5:SGO1:SPC24:BUB1:ZWILCH][CKAP5|NUP133:KIF18A:DSN1]
FALSE    [NDC80|CENPA:CENPK:SKA1:SPC24:MAD2L1:AURKB][XPO1|CENPO:CENPE:SPC24:SGO2]
FALSE    [NUF2|CENPA:CENPF:SGO1:SKA1:TUBA1C:AURKB][BUB3|NUP37][ERCC6L|CENPI:SRF:SGO1:SKA1:TUBA1C]
FALSE    [CENPQ|NUP37:NDC80:CENPI:MAD2L1][ZWINT|NDC80:SGO1:DSN1][DIAPH3|NDC80:CENPI:SRF:SGO2]
FALSE    [AHCTF1|NDC80:BIRC5:CENPF:CENPE][CENPL|XPO1:CENPO:NUF2:DSN1:AHCTF1:CKAP5]
FALSE    [RCC2|NUP133:CENPI:CENPF:CDCA8:DIAPH3:CENPU:BUB1]
FALSE   nodes:                                 48 
FALSE   arcs:                                  150 
FALSE     undirected arcs:                     0 
FALSE     directed arcs:                       150 
FALSE   average markov blanket size:           12.42 
FALSE   average neighbourhood size:            6.25 
FALSE   average branching factor:              3.12 
FALSE 
FALSE   generation algorithm:                  Empty

The same can be done for bnpathplot.

cl <- parallel::makeCluster(12)
n1 <- bnpathplot(pway, vstedTCGA, nCategory = 15, returnNet=T, R=200, cl=cl)
n2 <- bnpathplot(pway, vstedTCGA, nCategory = 15, returnNet=T, R=200, cl=cl)
n3 <- bnpathplot(pway, vstedTCGA, nCategory = 15, returnNet=T, R=200, cl=cl)
n4 <- bnpathplot(pway, vstedTCGA, nCategory = 15, returnNet=T, R=200, cl=cl)
n5 <- bnpathplot(pway, vstedTCGA, nCategory = 15, returnNet=T, R=200, cl=cl)
nets <- list(n1$av, n2$av, n3$av, n4$av, n5$av)
fvalues <- compareBNs(nets)
mean(fvalues)

FALSE [1] 0.9752001

parallel::stopCluster(cl)

7.2 Testing R

We provide the diagnostic function of inferred network across multiple bootstrap numbers. We can test how R values affect the resulting network. It is very time consuming to test many genes, thus the setting up cl argument using library parallel is suggested. We can specify which scoring function to use by scoreType. Additionally, returned strength data frame as well as raw data frame can be used to assess various metrics and thresholds like structural hamming distance.

cl <- parallel::makeCluster(6)
pathTest <- bnpathtest(results = pway, algo="hc",
           exp = vsted, Rrange = seq(10, 300, 10), nCategory = 15,
           expSample = incSample,
           expRow = "ENSEMBL", cl = cl)

# Plot
difEdges <- sapply(pathTest$graph, function(x) sapply(pathTest$graph, function(y) length(E(difference(as.igraph(x), as.igraph(y))))))
difEdges <- data.frame(difEdges[dim(difEdges)[1], colnames(difEdges)])
colnames(difEdges) <- c("NumDif")
ggplot(difEdges, aes(x=as.numeric(substring(rownames(difEdges), 2)), y=NumDif)) +
  geom_line(group=1)+
  geom_point(aes(color=NumDif), size=5)+
  scale_color_gradient(low = "blue", high = "red", name="Number of different edges", guide = guide_legend())+
  theme_bw()+xlab("R")+ylab("Number of different edges")+
  theme(legend.position="bottom")

parallel::stopCluster(cl)

## Which pathway?
paste(pway@result[15,]$Description, pway@result[15,]$Count)

FALSE [1] "Homologous DNA Pairing and Strand Exchange 22"

## From 10 to 200
cl = parallel::makeCluster(6)
geneTest <- bngenetest(results = pway, algo="hc", pathNum=15,
           exp = vsted, Rrange=seq(10, 200, 10),
           expSample = incSample,
           expRow = "ENSEMBL", cl = cl)

difSHD <- sapply(geneTest$graph, function(x) sapply(geneTest$graph, function(y) shd(x, y)))
# Plot SHD compared to the highest R network
difSHD <- data.frame(difSHD[dim(difSHD)[1], colnames(difSHD)])
colnames(difSHD) <- c("SHD")
ggplot(difSHD, aes(x=as.numeric(substring(rownames(difSHD), 2)), y=SHD)) +
  geom_line(group=1)+
  geom_point(aes(color=SHD), size=5)+
  scale_color_gradient(low = "blue", high = "red", name="SHD", guide = guide_legend())+
  theme_bw()+xlab("R")+ylab("structural hamming distance")+
  theme(legend.position="bottom")

parallel::stopCluster(cl)