Chapter 5 Including clinical variables
5.1 Preparation of data
One of the important reasons of the Bayesian network analysis is assessing the relationship between gene expressions and clinical variables. cpbnplot
offers incorporating metadata into inference. As a demonstrative purpose, the enrichment analysis results from GSE133624 is applied on data from The Cancer Genome Atlas Program (TCGA) (Cancer Genome Atlas Research Network et al. 2013). Specifically, TCGA-BLCA data is downloaded by the useful library TCGAbiolinks
(Colaprico et al. 2016).
library(TCGAbiolinks)
## Not run
# query <- GDCquery(project = "TCGA-BLCA",
# data.category = "Transcriptome Profiling",
# data.type = "Gene Expression Quantification",
# workflow.type = "HTSeq - Counts")
# download <- GDCdownload(query)
# tcgaData <- GDCprepare(query)
# save(file="tcgaData.rda", tcgaData)
## Load dataset
load(file="tcgaData.rda")
We again applied VST on the data, and filtered the metadata based on the variables to be included in the inference. In this analysis, age_at_diagnosis
and paper_Combined T and LN category
, which is a sum of Tumor category 1/2 (0) vs. 3/4 (1) and LN negative (0) vs positive (1), are included.
library(DESeq2)
library(dplyr)
<- assays(tcgaData)
dataAssay <- dataAssay@listData$`HTSeq - Counts`
tcgaCount
## Make DESeq2 object
<- DESeqDataSetFromMatrix(countData = tcgaCount,
ddsTCGA colData = tcgaData@colData,
design= ~ 1)
<- assay(vst(ddsTCGA))
vstedTCGA
## Variable selection phase
<- data.frame(tcgaData@colData) %>%
metadata select(age_at_diagnosis, paper_Combined.T.and.LN.category) %>%
na.omit() %>%
filter(paper_Combined.T.and.LN.category!="ND")
## Scale and factorize
$age_at_diagnosis <- as.numeric(scale(metadata$age_at_diagnosis))
metadata$paper_Combined.T.and.LN.category <- as.factor(metadata$paper_Combined.T.and.LN.category) metadata
5.2 Inference of pathway relationship including clinical variables
We assess the relationship between the curated biological pathway information and clinical variables mentioned above. Variables other than expression data can be specified with otherVar
, as well as otherVarName
for their name. The order of otherVar
must be same as column order of expression data. We use all the significant pathways of corrected p-values below 0.05.
<- bnpathplot(pway,
bnCov
vstedTCGA,nCategory = 1000,
adjpCutOff = 0.05,
expSample=rownames(metadata),
algo="hc", strType="normal",
otherVar=metadata,
otherVarName=c("Age", "Category"),
R=50, cl=parallel::makeCluster(10),
returnNet=T)
## Check DAG
::is.dag(as.igraph(bnCov$av)) igraph
FALSE [1] TRUE
## Fit the parameter to network based on the data
<- bn.fit(bnCov$av, bnCov$df) bnFit
Plot the resulting network.
$plot bnCov
5.3 Conditional probability query
Next we perform conditional probability queries by the bnlearn function cpdist
to elucidate how the clinical variables affect pathway regulation. First we fit the inferred network to the original data. These are stored in the named list. Logic sampling is performed unless otherwise stated.
Perform cpdist
, and visualize the distribution of “Molecules associated with elastic fibres” conditional on the tumor category using ggdist
.
library(bnlearn)
library(ggdist)
library(ggplot2)
<- "Molecules associated with elastic fibres"
candPath
<- cpdist(bnFit, nodes=c(candPath), evidence=(Category==0))
efz <- cpdist(bnFit, nodes=c(candPath), evidence=(Category==1))
efo <- cpdist(bnFit, nodes=c(candPath), evidence=(Category==2))
eft
= data.frame(
effect val = c(efz[,1], efo[,1], eft[,1]),
stage = c(rep("0",nrow(efz)), rep("1", nrow(efo)), rep("2", nrow(eft)))
)
<- effect %>% group_by(stage) %>% summarise(mean=mean(val))
disMean <- paste0(disMean$stage, " (mean=", round(disMean$mean,3), ")")
stageWMean $stageLabel <- c(rep(stageWMean[1],nrow(efz)), rep(stageWMean[2], nrow(efo)), rep(stageWMean[3], nrow(eft)))
effect
ggplot(effect, aes(x=val, y=stage, color=stageLabel, fill=stageLabel)) +
scale_color_manual(values=c("steelblue","gold","tomato")) +
scale_fill_manual(values=c("steelblue","gold","tomato")) +
stat_dotsinterval() + theme_bw() + ggtitle(candPath)
How the down-regulation in “Cell Cycle Checkpoints” affects the other pathways? This time using the importance sampling method, likelihood weighting.
<- names(bnFit)
predNodes <- predNodes[predNodes != "Cell Cycle Checkpoints"]
predNodes <- max(bnCov$df[candPath])
maxVal <- min(bnCov$df[candPath])
minVal
<- cpdist(bnFit, nodes=predNodes, evidence=list("Cell Cycle Checkpoints"=minVal), method="lw")
lowCCC <- attributes(lowCCC)$weights
lowW <- cpdist(bnFit, nodes=predNodes, evidence=list("Cell Cycle Checkpoints"=maxVal), method="lw")
highCCC <- attributes(highCCC)$weights
highW
## Remove the factor
$Category <- NULL
highCCC$Category <- NULL
lowCCC
<- apply(highCCC, 2, function(x) weighted.mean(x, highW)) - apply(lowCCC, 2, function(x) weighted.mean(x, lowW))
difMeanCCC
## Top absolute value
kable(head(difMeanCCC[order(abs(difMeanCCC), decreasing=TRUE)]), col.names=c("difference"))
difference | |
---|---|
M Phase | 23.43499 |
Mitotic Prometaphase | 21.20258 |
Mitotic Metaphase and Anaphase | 20.87912 |
Mitotic Anaphase | 20.73596 |
Separation of Sister Chromatids | 19.51641 |
Resolution of Sister Chromatid Cohesion | 19.35302 |
## Reflect the difference in the plot modifying ggplot2 object
<- bnCov$plot$data
changeCol <- difMeanCCC[changeCol$name]
difMeanCCC names(difMeanCCC) <- changeCol$name
$color <- difMeanCCC
changeCol
## Replace the color, change the legend
$plot$data <- changeCol
bnCov$plot + scale_color_continuous(low="blue",high="red",name="difference") bnCov
5.4 Gene relationship with variables
For the genes in interesting pathway, clinical variables can be incorporated too. We investigated the genes involved in the reactome “Molecules associated with elastic fibres.”
<- bngeneplot(pway,
bnGeneCov pathNum=43,
vstedTCGA, expSample=rownames(metadata),
otherVar=metadata,
hub=5, R=100,
otherVarName=c("Age","Category"),
cl=parallel::makeCluster(10),
returnNet=T)
Plot the resulting network of genes.
## Plot
$plot bnGeneCov
## Check DAG
::is.dag(as.igraph(bnGeneCov$av)) igraph
FALSE [1] TRUE
## Fit the parameter to network based on the data
<- bn.fit(bnGeneCov$av, bnGeneCov$df) bnFitGene
Perform cpdist
, and examine the mean and distribution using ggdist
. We can see that the expression of the gene EFEMP1, which is reported to be a candidate for a biomarker of aggressive bladder cancer or therapeutic targets (Han et al. 2017), is going up with each stage.
<- "EFEMP1"
candGene <- cpdist(bnFitGene, nodes=c(candGene), evidence=(Category==0))
efz <- cpdist(bnFitGene, nodes=c(candGene), evidence=(Category==1))
efo <- cpdist(bnFitGene, nodes=c(candGene), evidence=(Category==2))
eft
= data.frame(
effect val = c(efz[,1], efo[,1], eft[,1]),
stage = c(rep("0",nrow(efz)), rep("1", nrow(efo)), rep("2", nrow(eft)))
)
<- effect %>% group_by(stage) %>% summarise(mean=mean(val))
disMean <- paste0(disMean$stage, " (mean=", round(disMean$mean,3), ")")
stageWMean $stageLabel <- c(rep(stageWMean[1],nrow(efz)), rep(stageWMean[2], nrow(efo)), rep(stageWMean[3], nrow(eft)))
effect
ggplot(effect, aes(x=val, y=stage, color=stageLabel, fill=stageLabel)) +
scale_color_manual(values=c("steelblue","gold","tomato")) +
scale_fill_manual(values=c("steelblue","gold","tomato")) +
stat_dotsinterval() + theme_bw() + ggtitle(candGene)
5.5 Confirming the existing knowledge
To confirm the validity of the inferred Bayesian network, we can focus on some genes that is already validated to be related to clinical information or is incorporated into the daily clinical practice. To obtain the pathways that include the specific gene, one can use obtainPath
function. This time we focus on the gene MMP2, as the gene has been reported to be related to clinical variables in bladder cancer (Vasala, Pääkkö, and Turpeenniemi-Hujanen 2003; Fouad et al. 2019; Winerdal et al. 2018).
<- obtainPath(pway, "MMP2") pathSub
Using the top pathway involving MMP2, construct the network and plot.
<- bngeneplot(pathSub,
bnGeneCov2 pathNum=1,
vstedTCGA, expSample=rownames(metadata),
otherVar=metadata,
hub=5, R=50, algo="hc",
otherVarName=c("Age","Category"),
cl=parallel::makeCluster(10),
returnNet=T)
$plot bnGeneCov2
$av <- cpbnplot:::chooseEdgeDir(bnGeneCov2$av, bnGeneCov2$df, scoreType="mi-cg", debug=FALSE)
bnGeneCov2<- bn.fit(bnGeneCov2$av, bnGeneCov2$df) bnGeneCov2Fit
Predict the distribution.
<- "MMP2"
candGene
<- cpdist(bnGeneCov2Fit, nodes=c(candGene), evidence=(Category==0), method="ls")
mz <- cpdist(bnGeneCov2Fit, nodes=c(candGene), evidence=(Category==1), method="ls")
mo <- cpdist(bnGeneCov2Fit, nodes=c(candGene), evidence=(Category==2), method="ls")
mt
= data.frame(
effect val = c(mz[,1], mo[,1], mt[,1]),
stage = c(rep("0",nrow(mz)), rep("1", nrow(mo)), rep("2", nrow(mt)))
)
<- effect %>% group_by(stage) %>% summarise(mean=mean(val))
disMean <- paste0(disMean$stage, " (mean=", round(disMean$mean,3), ")")
stageWMean $stageLabel <- c(rep(stageWMean[1],nrow(mz)), rep(stageWMean[2], nrow(mo)), rep(stageWMean[3], nrow(mt)))
effect
ggplot(effect, aes(x=val, y=stage, color=stageLabel, fill=stageLabel)) +
scale_color_manual(values=c("steelblue","gold","tomato")) +
scale_fill_manual(values=c("steelblue","gold","tomato")) +
stat_dotsinterval() + theme_bw() + ggtitle(candGene)
It is interesting to investigate whether the result is similar in the other database, like Gene Ontology doing the same analysis.
## Perform the same analysis on GO enrichment result
<- obtainPath(pwayGO, "MMP2")
pathSubGO <- bngeneplot(pathSubGO,
bnCovGO
vstedTCGA,pathNum=1,
expSample=rownames(metadata),
otherVar=metadata,
R=50, layout="sugiyama",
otherVarName=c("Age","Category"),
cl=parallel::makeCluster(10),
returnNet=T)
$av <- cpbnplot:::chooseEdgeDir(bnCovGO$av, bnCovGO$df, scoreType="mi-cg", debug=FALSE)
bnCovGO<- bn.fit(bnCovGO$av, bnCovGO$df)
bnGeneCovGOFit
<- cpdist(bnGeneCovGOFit, nodes=c(candGene), evidence=(Category==0), method="ls")
mz <- cpdist(bnGeneCovGOFit, nodes=c(candGene), evidence=(Category==1), method="ls")
mo <- cpdist(bnGeneCovGOFit, nodes=c(candGene), evidence=(Category==2), method="ls")
mt
= data.frame(
effect val = c(mz[,1], mo[,1], mt[,1]),
stage = c(rep("0",nrow(mz)), rep("1", nrow(mo)), rep("2", nrow(mt)))
)
<- effect %>% group_by(stage) %>% summarise(mean=mean(val))
disMean <- paste0(disMean$stage, " (mean=", round(disMean$mean,3), ")")
stageWMean $stageLabel <- c(rep(stageWMean[1],nrow(mz)), rep(stageWMean[2], nrow(mo)), rep(stageWMean[3], nrow(mt)))
effect
ggplot(effect, aes(x=val, y=stage, color=stageLabel, fill=stageLabel)) +
scale_color_manual(values=c("steelblue","gold","tomato")) +
scale_fill_manual(values=c("steelblue","gold","tomato")) +
stat_dotsinterval() + theme_bw() + ggtitle(candGene) + labs(caption=pathSubGO@result$Description[1])
5.6 Investigating the network based on the clinical question
After confirming the knowledge, it is interesting to test how difference in clinical variables affect gene expression. bnlearn
can naturally handle this again using cpdist
. We now include two more variables, age_at_diagnosis
, gender
, paper_Noninvasive.bladder.cancer.therapy
, paper_Combined.T.and.LN.category
. Inference based on these information, we can ask:
Which genes have the biggest expression differences between those treated with BCG and with none, considering the networks of genes and clinical variables of age, gender, tumor category in TCGA-BLCA dataset using genes significantly differed in GSE133624 and involved in curated biological pathways related to MMP2?
## Metadata filtering
<- data.frame(tcgaData@colData) %>%
metadata select(age_at_diagnosis, gender,
paper_Noninvasive.bladder.cancer.therapy,%>%
paper_Combined.T.and.LN.category) na.omit() %>%
filter(paper_Combined.T.and.LN.category!="ND") %>%
filter(paper_Noninvasive.bladder.cancer.therapy!="ND")
$age_at_diagnosis <- as.numeric(scale(metadata$age_at_diagnosis))
metadata$gender <- as.factor(metadata$gender)
metadata$paper_Combined.T.and.LN.category <- as.factor(metadata$paper_Combined.T.and.LN.category)
metadata$paper_Noninvasive.bladder.cancer.therapy <- as.factor(metadata$paper_Noninvasive.bladder.cancer.therapy)
metadata
## Subset to significant pathways, and those related to MMP2
@result <- subset(pway@result, p.adjust<0.05)
pway<- obtainPath(pway, "MMP2")
pathSub
<- bngeneplot(pathSub,
bnCovGene3
vstedTCGA,pathNum = seq_len(nrow(pathSub)),
expSample=rownames(metadata),
otherVar=metadata,
hub=5, R=50,
otherVarName=c("Age","Gender","Therapy","Category"),
cl=parallel::makeCluster(10),
returnNet=T)
$av <- chooseEdgeDir(bnCovGene3$av, bnCovGene3$df, scoreType="mi-cg", debug=FALSE)
bnCovGene3<- bn.fit(bnCovGene3$av, bnCovGene3$df)
bnCovGene3Fit
<- names(bnCovGene3$av$nodes)
allGenes <- allGenes[!allGenes %in% c("Age","Gender","Therapy","Category")]
allGenes <- cpdist(bnCovGene3Fit, nodes=allGenes, evidence=(Therapy=="none"), method="ls")
no <- cpdist(bnCovGene3Fit, nodes=allGenes, evidence=(Therapy=="Bacillus Calmette.Guerin (BCG)"), method="ls")
bcg
<- data.frame(apply(bcg, 2, mean)-apply(no, 2, mean))
difMean $name <- rownames(difMean)
difMeancolnames(difMean) <- c("difference","name")
<- difMean[order(abs(difMean$difference), decreasing=T),]
difMean kable(head(difMean, n=5))
difference | name | |
---|---|---|
PCOLCE | 0.3511844 | PCOLCE |
ADAMTS8 | -0.1952817 | ADAMTS8 |
COL11A1 | 0.1914131 | COL11A1 |
COL25A1 | 0.1877005 | COL25A1 |
ELN | -0.1367145 | ELN |
We can reflect the difference to the plot. In the previous EFEMP1 plot:
<- names(bnFitGene)
candGene <- candGene[candGene != "Category"]
candGene <- cpdist(bnFitGene, nodes=candGene, evidence=(Category==0))
efz2 <- cpdist(bnFitGene, nodes=candGene, evidence=(Category==2))
eft2
<- apply(eft2, 2, mean) - apply(efz2, 2, mean)
difMean
<- bnGeneCov$plot$data
changeCol <- difMean[changeCol$name]
difMean names(difMean) <- changeCol$name
$color <- difMean
changeCol
## Replace shape and color
$shape <- rep(19, dim(bnGeneCov$plot$data)[1])
changeCol$plot$data <- changeCol
bnGeneCov$plot + scale_color_continuous(low="blue",high="red",name="difference") bnGeneCov
5.7 Classification using BN
Inferred BN can be used as a classifier of conditions. In this analysis, we perform the classification of whether the cancer samples are harboring TP53 mutation or not (column paper_mutation in TP53
). First, we make a metadata table as same as the above examples.
<- data.frame(tcgaData@colData) %>%
metadata ::select(age_at_diagnosis, gender, paper_mutation.in.TP53, paper_Combined.T.and.LN.category) %>% na.omit() %>%
dplyrfilter(paper_mutation.in.TP53!="ND") %>%
filter(paper_Combined.T.and.LN.category!="ND")
table(metadata$paper_mutation.in.TP53)
FALSE
FALSE no yes
FALSE 184 163
## Set TP53 status to numeric of 0/1.
$paper_mutation.in.TP53 <- as.numeric(as.factor(metadata$paper_mutation.in.TP53))-1
metadata$age_at_diagnosis <- as.numeric(scale(metadata$age_at_diagnosis))
metadata$paper_Combined.T.and.LN.category <- as.factor(metadata$paper_Combined.T.and.LN.category)
metadata$gender <- as.factor(metadata$gender) metadata
Split the data to train/test according to TP53 mutation status using caret
(Kuhn 2008). In this analysis, the five-fold cross validation is performed. Fit the model using the expression of genes in the pathway. This time the classification performance of significant pathways (corrected p < 1e-7
) are to be compared. onlyDf
option can be enabled to return only the data.frame containing data for prediction, useful for testing purpose.
set.seed(53) # Seed for split
<- caret::createFolds(factor(metadata$paper_mutation.in.TP53), k = 5, list = TRUE, returnTrain=TRUE)
trainIndex <- list() ## Store network in the list
allnets <- list() ## Store prediction in the list
allClassRes
for (f in seq_len(5)) {
<- list()
nets <- list()
classRes <- trainIndex[[f]]
foldTrainIndex ## Recursively fit and test for significant pathways
for (pnum in seq(1, dim(subset(pway@result, p.adjust<1e-5))[1], 1)) {
<- parallel::makeCluster(12)
cl <- bngeneplot(pway, vstedTCGA[,rownames(metadata)][, foldTrainIndex], pathNum=pnum, layout="sugiyama",
bnCovTrain expSample=rownames(metadata[foldTrainIndex,]), algo="hc", strType="normal",
otherVar=metadata[foldTrainIndex,], otherVarName=c("Age","Gender","TP53","Category"),
R=50, cl=cl, returnNet=T)
## Return only DF for testing
<- bngeneplot(pway, vstedTCGA[,rownames(metadata)][, -foldTrainIndex], pathNum=pnum,
bnCovTest expSample=rownames(metadata[-foldTrainIndex,]),
otherVar=metadata[-foldTrainIndex,], otherVarName=c("Age","Gender","TP53","Category"),
onlyDf=T)
$av <- cpbnplot:::chooseEdgeDir(bnCovTrain$av, bnCovTrain$df, scoreType="mi-cg", debug=FALSE)
bnCovTrain
## If DAG and TP53 have parents
if ( igraph::is.dag(bnlearn::as.igraph(bnCovTrain$av)) && length(bnCovTrain$av$nodes$TP53$parents) >= 1 ){
<- bnlearn::bn.fit(bnCovTrain$av, bnCovTrain$df)
bnCovLargeFit <- sigmoid::sigmoid(predict(bnCovLargeFit, node="TP53", data=bnCovTest, method = "bayes-lw")) # Use sigmoid function
pred @result$Description[pnum]]] <- pred
classRes[[pway@result$Description[pnum]]] <- bnCovTrain
nets[[pwayelse {
} message(paste0("Among pathway ", pway@result$Description[pnum], ", no parent node of TP53 is found, or inferred network is not dag."))
}::stopCluster(cl)
parallel
}<- nets
allnets[[f]] <- classRes
allClassRes[[f]] }
Using the library pROC
, calculate the area under ROC (auROC) (Robin et al. 2011).
library(pROC)
library(ggplotify)
<- c()
rocDf <- list()
allRocList for (f in seq_len(5)) {
<- metadata[-trainIndex[[f]],]$paper_mutation.in.TP53
correct <- data.frame(allClassRes[[f]])
predDf $label <- correct
predDf
<- list()
rocList for (i in seq_len(dim(predDf)[2]-1)){
names(predDf)[i]]] <- roc(predDf$label, predDf[,i], ci=TRUE)
rocList[[
}<- data.frame(t(data.frame(purrr::map(rocList, function(x) as.numeric(x$auc)))))
tmpRocDf colnames(tmpRocDf) <- c(paste0("auc",f))
$name <- rownames(tmpRocDf)
tmpRocDf<- tmpRocDf
allRocList[[f]]
}
<- allRocList %>%
allRocListDf ::reduce(left_join, by = "name")
purrr
<- allRocListDf %>%
rocMean rowwise() %>%
mutate(Min = min(c_across(starts_with("auc")), na.rm=T),
Max = max(c_across(starts_with("auc")), na.rm=T),
Mean = mean(c_across(starts_with("auc")), na.rm=T),
Sd = sd(c_across(starts_with("auc")), na.rm=T)) %>%
select(name ,Mean, Sd) %>%
arrange(desc(Mean))
kable(rocMean, row.names=FALSE, booktab=TRUE) %>%
kable_styling(font_size = 10)
name | Mean | Sd |
---|---|---|
Synthesis.of.DNA | 0.7702640 | 0.0483161 |
S.Phase | 0.7564041 | 0.0606541 |
DNA.strand.elongation | 0.7558024 | 0.0515143 |
DNA.Replication | 0.7418214 | 0.0724349 |
Mitotic.Spindle.Checkpoint | 0.7385408 | 0.0384938 |
Mitotic.Anaphase | 0.7359661 | 0.0436257 |
Resolution.of.Sister.Chromatid.Cohesion | 0.7338589 | 0.0296269 |
Mitotic.Metaphase.and.Anaphase | 0.7305106 | 0.0879798 |
Homologous.DNA.Pairing.and.Strand.Exchange | 0.7290159 | 0.0428621 |
Activation.of.ATR.in.response.to.replication.stress | 0.7269121 | 0.0367326 |
Amplification.of.signal.from.the.kinetochores | 0.7195224 | 0.0399094 |
HDR.through.Homologous.Recombination..HRR. | 0.7185543 | 0.0289043 |
RHO.GTPases.Activate.Formins | 0.7082054 | 0.0360879 |
Cell.Cycle.Checkpoints | 0.7065901 | 0.0451219 |
Mitotic.Prometaphase | 0.7053514 | 0.0573109 |
Amplification..of.signal.from.unattached..kinetochores.via.a.MAD2..inhibitory.signal | 0.6991004 | 0.0965263 |
Separation.of.Sister.Chromatids | 0.6986134 | 0.0845527 |
EML4.and.NUDC.in.mitotic.spindle.formation | 0.6749238 | 0.0708973 |
Show the resulting network with the top auROC, and the ROC plot using pROC
(Robin et al. 2011).
<- rocMean[1,"name"]
topPath <- as.numeric(allRocListDf %>% filter(name == as.character(topPath)) %>% summarize(which.max(c_across(starts_with("auc")))))
candFold
## With the CI
<- data.frame(metadata[-trainIndex[[candFold]],]$paper_mutation.in.TP53)
candRoc colnames(candRoc) <- c("label")
$pred <- as.numeric(unlist(allClassRes[[candFold]][gsub("[.]", " ", as.character(topPath))]))
candRoc
<- as.ggplot(function(){
rocplot <- plot.roc(candRoc$label, candRoc$pred, print.auc = TRUE, ci=TRUE, col="black",
rocobj1 main = "Mutation in TP53", percent=TRUE)
<- ci.se(rocobj1, specificities = seq(0, 100, 5))
ciobj plot(ciobj, type = "shape", col = "steelblue")
})
## Along with the network
<- allnets[[candFold]][[gsub("[.]", " ", topPath)]]
topNet $plot + rocplot topNet
Using bnlearn::cpdist
, check the difference in the distribution mean when the value of the node TP53 is above and below 0.5.
<- bnlearn::bn.fit(topNet$av, topNet$df)
topFit
<- names(topNet$av$nodes)
candNodes <- candNodes[!candNodes %in% c("TP53","Category","Gender")]
candNodes
<- cpdist(topFit, nodes=candNodes, evidence=(TP53 < 0.5))
tp53low dim(tp53low)
FALSE [1] 5207 43
<- cpdist(topFit, nodes=candNodes, evidence=(TP53 > 0.5))
tp53high dim(tp53high)
FALSE [1] 4753 43
<- apply(tp53high, 2, mean) - apply(tp53low, 2, mean)
difMeanTp53 kable(head(difMeanTp53[order(abs(difMeanTp53), decreasing=T)]), col.names=c("difference"))
difference | |
---|---|
MCM2 | 0.4732629 |
RFC4 | 0.4666554 |
ORC1 | 0.4331116 |
MCM8 | 0.4323734 |
GINS1 | 0.4120761 |
CCNE2 | 0.3795090 |
<- topNet$plot$data
changeCol <- difMeanTp53[changeCol$name]
difMeanTp53 names(difMeanTp53) <- changeCol$name
$color <- difMeanTp53
changeCol
## Replace shape and color
$plot$data <- changeCol
topNet$plot + scale_color_continuous(low="blue",high="red",name="difference") topNet
When the TP53 takes the extreme values on logic sampling.
<- bnlearn::bn.fit(topNet$av, topNet$df)
topFit
<- names(topNet$av$nodes)
candNodes <- candNodes[!candNodes %in% c("TP53","Category","Gender")]
candNodes
<- cpdist(topFit, nodes=candNodes, evidence=(TP53 < 0.01))
tp53low dim(tp53low)
FALSE [1] 1814 43
<- cpdist(topFit, nodes=candNodes, evidence=(TP53 > 0.99))
tp53high dim(tp53high)
FALSE [1] 1448 43
<- apply(tp53high, 2, mean) - apply(tp53low, 2, mean)
difMeanTp53 kable(head(difMeanTp53[order(abs(difMeanTp53), decreasing=T)]), col.names=c("difference"))
difference | |
---|---|
MCM2 | 0.9259180 |
RFC4 | 0.8718847 |
ORC1 | 0.8373246 |
GINS1 | 0.8137102 |
MCM8 | 0.8044904 |
PRIM1 | 0.7336477 |
<- topNet$plot$data
changeCol <- difMeanTp53[changeCol$name]
difMeanTp53 names(difMeanTp53) <- changeCol$name
$color <- difMeanTp53
changeCol
## Replace shape and color
$plot$data <- changeCol
topNet$plot + scale_color_continuous(low="blue",high="red",name="difference") topNet
Perform likelihood weighting.
<- cpdist(topFit, nodes=candNodes, evidence=list(TP53 = 0), method="lw")
tp53low <- cpdist(topFit, nodes=candNodes, evidence=list(TP53 = 1), method="lw")
tp53high
<- apply(tp53high, 2, function(x) weighted.mean(x, attributes(tp53high)$weights)) - apply(tp53low, 2, function(x) weighted.mean(x, attributes(tp53low)$weights))
difMeanTp53
<- topNet$plot$data
changeCol <- difMeanTp53[changeCol$name]
difMeanTp53
kable(head(difMeanTp53[order(abs(difMeanTp53), decreasing=T)]), col.names=c("difference"))
difference | |
---|---|
MCM2 | 0.5981190 |
RFC4 | 0.5856990 |
ORC1 | 0.5484684 |
GINS1 | 0.5112619 |
MCM8 | 0.5072800 |
CCNA2 | 0.4784021 |
names(difMeanTp53) <- changeCol$name
$color <- difMeanTp53
changeCol
## Replace shape and color
$plot$data <- changeCol
topNet$plot + scale_color_continuous(low="blue",high="red",name="difference") topNet
queryCpDistLw
function performs sampling by likelihood weighting using cpdist, and returns the data.frame with weights. It just performs cpdist
on the queried level and produce a plot. queryCpDistLs
performs logic sampling.
## Only likelihood weighting is supported in queryCpDist.
<- queryCpDistLw(topFit, names(difMeanTp53)[1], evidence="TP53", level=c(0,0.5,1))
q1 kable(head(q1$df[,c(names(difMeanTp53)[1],"weights")]))
RFC2 | weights |
---|---|
10.99188 | 0.8078536 |
10.16224 | 0.9962997 |
11.63254 | 0.1592045 |
10.83047 | 0.9999093 |
11.16883 | 0.9024512 |
10.14470 | 0.7950615 |