A tutorial to guide through the usage for the BayesMP R package. A real data example of the mltiple tissue mouse metabolism data is used. The following parts are included:
The package is available on GitHub page (https://github.com/Caleb-Huo/BayesMP)
To install the package,
library(devtools)
install_github("Caleb-Huo/BayesMP")
The paper is accepted by the annuals of applied statistics.
The pre-print can be found on ArXiv (https://arxiv.org/pdf/1707.03301.pdf)
Note that you may need internet connection to read in the data.
# include necessary packages
library(BayesMP) # Include the BayesMP package
#> Loading required package: cluster
#> Loading required package: qvalue
#> Loading required package: truncnorm
#> Loading required package: invgamma
library(limma) # Will perform differential expression analysis
# read in the data
data_brown <- read.csv("https://bayesmp.github.io/data/mouseMetabolism/data_brown.csv", row.names = 1)
data_heart <- read.csv("https://bayesmp.github.io/data/mouseMetabolism/data_heart.csv", row.names = 1)
data_liver <- read.csv("https://bayesmp.github.io/data/mouseMetabolism/data_liver.csv", row.names = 1)
# Verify gene names match across three tissues
all(rownames(data_brown) == rownames(data_heart))
#> [1] TRUE
all(rownames(data_brown) == rownames(data_liver))
#> [1] TRUE
# Combime these three studies as list
dataExp <- list(brown=data_brown, heart=data_heart, liver=data_liver)
# Check the dimension of the three studies
sapply(dataExp, dim)
#> brown heart liver
#> [1,] 6883 6883 6883
#> [2,] 8 7 8
# Check the head of the three studies
sapply(dataExp, head)
#> $brown
#> b.wt b.wt.1 b.wt.2 b.wt.3 b.LCAD b.LCAD.1 b.LCAD.2
#> Copg1 8.086841 8.047482 8.140015 8.010229 8.206645 8.151032 7.956093
#> Atp6v0d1 9.807054 9.637094 10.044481 9.825333 9.868880 9.667059 9.541244
#> Golga7 9.889406 9.763597 9.884090 9.754937 9.879255 9.932499 10.058944
#> Psph 8.938035 8.980251 8.233583 9.015927 8.270020 8.398283 8.486339
#> Trappc4 8.172845 7.885775 8.426231 8.453419 8.569754 8.671850 8.719143
#> Dpm2 7.699081 7.731294 7.890931 7.794705 7.934195 7.952838 7.860637
#> b.LCAD.3
#> Copg1 8.086200
#> Atp6v0d1 9.581028
#> Golga7 10.006400
#> Psph 8.474880
#> Trappc4 8.419258
#> Dpm2 7.754330
#>
#> $heart
#> h.wt h.wt.1 h.wt.2 h.LCAD h.LCAD.1 h.LCAD.2
#> Copg1 7.859429 7.955171 8.045601 8.145281 8.016827 7.961778
#> Atp6v0d1 9.479398 9.499617 9.571348 9.469063 9.516679 9.437079
#> Golga7 10.032911 9.935121 10.091663 10.030687 10.027411 10.064840
#> Psph 6.641783 6.567673 6.492371 7.089537 7.034313 6.954472
#> Trappc4 7.508498 7.539079 7.519181 7.889955 8.017153 7.926899
#> Dpm2 7.496504 7.495553 7.396052 7.303650 7.444895 7.357132
#> h.LCAD.3
#> Copg1 7.964703
#> Atp6v0d1 9.559526
#> Golga7 10.026984
#> Psph 6.936372
#> Trappc4 8.058787
#> Dpm2 7.565132
#>
#> $liver
#> l.wt l.wt.1 l.wt.2 l.wt.3 l.LCAD l.LCAD.1
#> Copg1 8.501327 8.698994 8.095882 8.519093 8.539002 8.305171
#> Atp6v0d1 9.969806 9.975494 10.000650 10.161694 10.051711 10.084761
#> Golga7 10.354004 10.298948 10.393287 10.613785 10.234465 10.341869
#> Psph 7.116696 7.306718 6.391776 6.761086 7.011396 6.510028
#> Trappc4 6.868989 6.957635 7.048474 7.054501 7.435293 7.440688
#> Dpm2 7.587628 7.622524 7.636579 7.615467 7.799874 7.627128
#> l.LCAD.2 l.LCAD.3
#> Copg1 8.588183 8.554201
#> Atp6v0d1 9.989209 10.035293
#> Golga7 10.244426 10.267127
#> Psph 6.841652 6.485354
#> Trappc4 7.517063 8.482264
#> Dpm2 7.650137 7.783372
# perform differential expression analysis for each of these three tissues.
# Create an empty matrix to store Z value.
# Each row represents a gene and each column represent a study/tissue.
# Note that Z value matrix is the input for BayesMP method.
# Z value can be calculated by inverse CDF of the p-value from differential expression analysis.
# A positive Z value indicates that the gene is upregulated in that study/tissue.
Z <- matrix(0,nrow=nrow(dataExp[[1]]),ncol=length(dataExp))
rownames(Z) <- rownames(dataExp[[1]])
colnames(Z) <- names(dataExp)
for(s in 1:length(dataExp)){
adata <- dataExp[[s]]
ControlLabel = grep('wt',colnames(adata))
caseLabel = grep('LCAD',colnames(adata))
label <- rep(NA, ncol(adata))
label[ControlLabel] = 0
label[caseLabel] = 1
design = model.matrix(~label) # design matrix
fit <- lmFit(adata,design) # fit limma model
fit <- eBayes(fit)
aeffectsize <- fit$coefficients[,2] # get effect sizes
Z[aeffectsize>0,s] <- -qnorm(fit$p.value[aeffectsize>0,2]/2)
Z[aeffectsize<0,s] <- qnorm(fit$p.value[aeffectsize<0,2]/2)
## here we obstain the Z score based on the input p-values and the effect size directions.
## qnorm is the inverse normal CDF.
## Divided by two will convert the two-sided p-value to one-sided p-value
}
head(Z)
#> brown heart liver
#> Copg1 0.3653645 0.9241556 0.3072930
#> Atp6v0d1 -1.4103484 -0.3644899 0.1887445
#> Golga7 1.7723627 0.3020690 -1.5459529
#> Psph -2.0051475 3.8998964 -0.8064148
#> Trappc4 2.2715633 4.0788102 2.4933007
#> Dpm2 1.1710226 -0.5892889 1.3667320
BayesMP will model the alternative distribution via Dirichlet process, which is a non-parametric model and robust again model perturbations.
# WD <- '~/Desktop/'
# setwd(WD) # You can set the working directory here. Some MCMC results will be saved here.
niter <- 1000 # number of iterations
# In real application, niter=10000 is suggested.
burnin <- 200 # number of burnin samples. Note the burnin samples will be disgarded.
# In real application, burnin=500 is suggested.
set.seed(15213) # set random seed
# writeHSall=T will save the intermediate results for the Bayesian hypothesis testing settings.
# writeY=T will save the intermediate results for the posterior samples of Y, which is used as the input for metaPattern detection.
# If you want to track the MCMC samples for other paramters including gamma, Pi, Delta,
# One can save these results by set writeGamma, writePi, writeDelta to be TRUE.
# One can set gamma to be fixed by setting updateGamma = TRUE, if he has a good estimate of initial gamma.
system.time(BayesMP(Z,niter=niter, burnin=burnin, writeY = TRUE, writeHSall=TRUE))
#> performing MCMC, i = 1,2,..., niter (= 1000) [one "." per sample]:
#> user system elapsed
#> 123.788 0.806 127.547
HSallRes <- read.table('BayesMP_HSall.txt') # read in the intermediate results for the Bayesian hypothesis testing
# Omega(B)
HSb_belief <- HSallRes[,1]/(niter - burnin) # Bayesian belief for Omega(B)
HSb_qvalue <- BayesianFDR(HSb_belief) # Bayesian FDR for each gene
sum(HSb_qvalue<0.05)
#> [1] 2047
sum(HSb_qvalue<0.01)
#> [1] 970
# Omega(A)
S <- ncol(Z)
HSa_belief <- HSallRes[,S]/(niter - burnin) # Bayesian belief for Omega(A)
HSa_qvalue <- BayesianFDR(HSa_belief) # Bayesian FDR for each gene
sum(HSa_qvalue<0.05)
#> [1] 162
# Omega(r)
r <- 2
HSr_belief <- HSallRes[,r]/(niter - burnin) # Bayesian belief for Omega(r)
HSr_qvalue <- BayesianFDR(HSr_belief) # Bayesian FDR for each gene
sum(HSr_qvalue<0.05)
#> [1] 875
con <- file('BayesMP_Y.txt', open = "r")
G <- nrow(Z)
S <- ncol(Z)
# the resYplus and resYminus matrices are used to save "latent expression"
resYplus <- matrix(0,G,S)
resYminus <- matrix(0,G,S)
i = 1
while (length(oneLine <- readLines(con, n = 1, warn = FALSE)) > 0) {
if(i>burnin){
#print(i)
seven = strsplit(oneLine, "\t")[[1]]
thisY <- matrix(as.numeric(seven),G,S)
# for individual studies
resYplus[thisY>0] <- resYplus[thisY>0] + 1
resYminus[thisY<0] <- resYminus[thisY<0] + 1
}
i = i + 1
}
close(con)
# we only consider HSb_qvalue<0.01 for the meta-pattern detection
resYplus_DE <- resYplus[HSb_qvalue<0.01,]
resYminus_DE <- resYminus[HSb_qvalue<0.01,]
# tight clustering
dissimilarity <- distance(resYplus_DE, resYminus_DE, niter - burnin) # calculate dissimilarity matrix for each pair of genes.
atarget <- 2
# Alternatively, one may try the consensus clustering algorithm to detect gene modules.
# perform the tight clustering to identify meta-pattern modules.
# k.min controls the size of the final meta module.
# A small k.min will result in large module size and a large k.min will result in small module size.
tightClustResult <- tightClustPam(dissimilarity, target=atarget, k.min=20)
#> Number of points: 970 Dimension: 970
#>
#> Looking for tight cluster 1 ...
#> k = 20
#> k = 21
#> 1 tight cluster(s) found!
#> Cluster size: 272 Remaining number of points: 698
#>
#> Looking for tight cluster 2 ...
#> k = 19
#> k = 20
#> 2 tight cluster(s) found!
#> Cluster size: 187 Remaining number of points: 511
for(i in 1:atarget){
clustSelection <- tightClustResult$cluster == i
numS = ncol(resYplus_DE)
barData <- cbind(resYplus_DE,resYminus_DE)[clustSelection,]/(niter - burnin)
barMean <- colMeans(barData)
barSd <- apply(barData,2,sd)
mp <- barplot(barMean, axes=FALSE, axisnames=FALSE, ylim=c(0, 1.2),
col=c(rep('red',numS), rep('blue',numS)), main=paste('target',i,'\n n =',sum(clustSelection)),
xlab="Study", ylab="prosterior probablity")
axis(1, labels=c(paste(1:numS,"+",sep=''), paste(1:numS,"-",sep='')), at = mp)
axis(2, at=seq(0 , 1, by=0.2))
box()
# Plot the vertical lines of the error bars
# The vertical bars are plotted at the midpoints
segments(mp, barMean - barSd, mp, barMean + barSd, lwd=2)
# Now plot the horizontal bounds for the error bars
# 1. The lower bar
segments(mp - 0.1, barMean - barSd, mp + 0.1, barMean - barSd, lwd=2)
# 2. The upper bar
segments(mp - 0.1, barMean + barSd, mp + 0.1, barMean + barSd, lwd=2)
}
for(s in 1:length(dataExp)){
adata <- dataExp[[s]]
aname <- names(dataExp)[s]
bdata <- adata[HSb_qvalue<0.01, ][tightClustResult$cluster == 1 ,]
cdata <- as.matrix(bdata)
ddata <- t(scale(t(cdata))) # standardize the data such that for each gene, the mean is 0 and sd is 1.
ColSideColors <- rep("black", ncol(adata))
ColSideColors[grep('LCAD',colnames(adata))] <- "red"
B <- 16
redGreenColor <- rgb(c(rep(0, B), (0:B)/B), c((B:0)/16, rep(0, B)), rep(0, 2*B+1))
heatmap(ddata,Rowv=NA,ColSideColors=ColSideColors,col= redGreenColor ,scale='none',Colv=NA, main=aname)
}