1 Introduction

A tutorial to guide through the usage for the BayesMP R package. A real data example of the mltiple tissue mouse metabolism data is used. The following parts are included:

2 How to install the package

The package is available on GitHub page (https://github.com/Caleb-Huo/BayesMP)

To install the package,

library(devtools)
install_github("Caleb-Huo/BayesMP")

3 How to cite the package

The paper is accepted by the annuals of applied statistics.

The pre-print can be found on ArXiv (https://arxiv.org/pdf/1707.03301.pdf)

4 How to prepare the input for BayesMP

Note that you may need internet connection to read in the data.

4.1 Include packages

# include necessary packages
library(BayesMP) # Include the BayesMP package
#> Loading required package: cluster
#> Loading required package: qvalue
#> Loading required package: truncnorm
#> Loading required package: invgamma
library(limma) # Will perform differential expression analysis

4.2 Read in data

# read in the data

data_brown <- read.csv("https://bayesmp.github.io/data/mouseMetabolism/data_brown.csv", row.names = 1)
data_heart <- read.csv("https://bayesmp.github.io/data/mouseMetabolism/data_heart.csv", row.names = 1)
data_liver <- read.csv("https://bayesmp.github.io/data/mouseMetabolism/data_liver.csv", row.names = 1)

# Verify gene names match across three tissues
all(rownames(data_brown) == rownames(data_heart))
#> [1] TRUE
all(rownames(data_brown) == rownames(data_liver))
#> [1] TRUE

# Combime these three studies as list
dataExp <- list(brown=data_brown, heart=data_heart, liver=data_liver)

# Check the dimension of the three studies
sapply(dataExp, dim)
#>      brown heart liver
#> [1,]  6883  6883  6883
#> [2,]     8     7     8

# Check the head of the three studies
sapply(dataExp, head)
#> $brown
#>              b.wt   b.wt.1    b.wt.2   b.wt.3   b.LCAD b.LCAD.1  b.LCAD.2
#> Copg1    8.086841 8.047482  8.140015 8.010229 8.206645 8.151032  7.956093
#> Atp6v0d1 9.807054 9.637094 10.044481 9.825333 9.868880 9.667059  9.541244
#> Golga7   9.889406 9.763597  9.884090 9.754937 9.879255 9.932499 10.058944
#> Psph     8.938035 8.980251  8.233583 9.015927 8.270020 8.398283  8.486339
#> Trappc4  8.172845 7.885775  8.426231 8.453419 8.569754 8.671850  8.719143
#> Dpm2     7.699081 7.731294  7.890931 7.794705 7.934195 7.952838  7.860637
#>           b.LCAD.3
#> Copg1     8.086200
#> Atp6v0d1  9.581028
#> Golga7   10.006400
#> Psph      8.474880
#> Trappc4   8.419258
#> Dpm2      7.754330
#> 
#> $heart
#>               h.wt   h.wt.1    h.wt.2    h.LCAD  h.LCAD.1  h.LCAD.2
#> Copg1     7.859429 7.955171  8.045601  8.145281  8.016827  7.961778
#> Atp6v0d1  9.479398 9.499617  9.571348  9.469063  9.516679  9.437079
#> Golga7   10.032911 9.935121 10.091663 10.030687 10.027411 10.064840
#> Psph      6.641783 6.567673  6.492371  7.089537  7.034313  6.954472
#> Trappc4   7.508498 7.539079  7.519181  7.889955  8.017153  7.926899
#> Dpm2      7.496504 7.495553  7.396052  7.303650  7.444895  7.357132
#>           h.LCAD.3
#> Copg1     7.964703
#> Atp6v0d1  9.559526
#> Golga7   10.026984
#> Psph      6.936372
#> Trappc4   8.058787
#> Dpm2      7.565132
#> 
#> $liver
#>               l.wt    l.wt.1    l.wt.2    l.wt.3    l.LCAD  l.LCAD.1
#> Copg1     8.501327  8.698994  8.095882  8.519093  8.539002  8.305171
#> Atp6v0d1  9.969806  9.975494 10.000650 10.161694 10.051711 10.084761
#> Golga7   10.354004 10.298948 10.393287 10.613785 10.234465 10.341869
#> Psph      7.116696  7.306718  6.391776  6.761086  7.011396  6.510028
#> Trappc4   6.868989  6.957635  7.048474  7.054501  7.435293  7.440688
#> Dpm2      7.587628  7.622524  7.636579  7.615467  7.799874  7.627128
#>           l.LCAD.2  l.LCAD.3
#> Copg1     8.588183  8.554201
#> Atp6v0d1  9.989209 10.035293
#> Golga7   10.244426 10.267127
#> Psph      6.841652  6.485354
#> Trappc4   7.517063  8.482264
#> Dpm2      7.650137  7.783372

# perform differential expression analysis for each of these three tissues.

# Create an empty matrix to store Z value. 
# Each row represents a gene and each column represent a study/tissue. 
# Note that Z value matrix is the input for BayesMP method. 
# Z value can be calculated by inverse CDF of the p-value from differential expression analysis. 
# A positive Z value indicates that the gene is upregulated in that study/tissue.

Z <- matrix(0,nrow=nrow(dataExp[[1]]),ncol=length(dataExp))
rownames(Z) <- rownames(dataExp[[1]])
colnames(Z) <- names(dataExp)

4.3 Perform differential expression analysis in each study

for(s in 1:length(dataExp)){
    adata <- dataExp[[s]]
    ControlLabel = grep('wt',colnames(adata))
    caseLabel = grep('LCAD',colnames(adata))
    label <- rep(NA, ncol(adata))
    label[ControlLabel] = 0
    label[caseLabel] = 1
    
    design = model.matrix(~label)   # design matrix
    fit <- lmFit(adata,design)      # fit limma model
    fit <- eBayes(fit)      
    
    aeffectsize <- fit$coefficients[,2] # get effect sizes
    Z[aeffectsize>0,s] <- -qnorm(fit$p.value[aeffectsize>0,2]/2)
    Z[aeffectsize<0,s] <- qnorm(fit$p.value[aeffectsize<0,2]/2)
    ## here we obstain the Z score based on the input p-values and the effect size directions.
    ## qnorm is the inverse normal CDF.
    ## Divided by two will convert the two-sided p-value to one-sided p-value
    
}

head(Z)
#>               brown      heart      liver
#> Copg1     0.3653645  0.9241556  0.3072930
#> Atp6v0d1 -1.4103484 -0.3644899  0.1887445
#> Golga7    1.7723627  0.3020690 -1.5459529
#> Psph     -2.0051475  3.8998964 -0.8064148
#> Trappc4   2.2715633  4.0788102  2.4933007
#> Dpm2      1.1710226 -0.5892889  1.3667320

5 BayesMP modeling via MCMC

BayesMP will model the alternative distribution via Dirichlet process, which is a non-parametric model and robust again model perturbations.

# WD <- '~/Desktop/'
# setwd(WD) # You can set the working directory here. Some MCMC results will be saved here.

niter <- 1000 # number of iterations
# In real application, niter=10000 is suggested.
burnin <- 200 # number of burnin samples. Note the burnin samples will be disgarded.
# In real application, burnin=500 is suggested.

set.seed(15213) # set random seed

# writeHSall=T will save the intermediate results for the Bayesian hypothesis testing settings.
# writeY=T will save the intermediate results for the posterior samples of Y, which is used as the input for metaPattern detection.
# If you want to track the MCMC samples for other paramters including gamma, Pi, Delta, 
# One can save these results by set writeGamma, writePi, writeDelta to be TRUE.
# One can set gamma to be fixed by setting updateGamma = TRUE, if he has a good estimate of initial gamma.

system.time(BayesMP(Z,niter=niter, burnin=burnin, writeY = TRUE, writeHSall=TRUE))
#> performing MCMC, i = 1,2,..., niter (= 1000)  [one "." per sample]:
#>    user  system elapsed 
#> 123.788   0.806 127.547

6 Task I, meta analysis. There are three hypothesis testing settings.

HSallRes <- read.table('BayesMP_HSall.txt') # read in the intermediate results for the Bayesian hypothesis testing
 
# Omega(B)
HSb_belief <- HSallRes[,1]/(niter - burnin) # Bayesian belief for Omega(B)
HSb_qvalue <- BayesianFDR(HSb_belief) # Bayesian FDR for each gene
sum(HSb_qvalue<0.05)
#> [1] 2047
sum(HSb_qvalue<0.01)
#> [1] 970


# Omega(A)
S <- ncol(Z)
HSa_belief <- HSallRes[,S]/(niter - burnin) # Bayesian belief for Omega(A)
HSa_qvalue <- BayesianFDR(HSa_belief) # Bayesian FDR for each gene
sum(HSa_qvalue<0.05)
#> [1] 162

# Omega(r)
r <- 2
HSr_belief <- HSallRes[,r]/(niter - burnin) # Bayesian belief for Omega(r)
HSr_qvalue <- BayesianFDR(HSr_belief) # Bayesian FDR for each gene
sum(HSr_qvalue<0.05)
#> [1] 875

7 Task II, differential expression meta-pattern (MetaPattern)

7.1 Read in MCMC posterior samples

con  <- file('BayesMP_Y.txt', open = "r")

G <- nrow(Z)
S <- ncol(Z)

# the resYplus and resYminus matrices are used to save "latent expression"
resYplus <- matrix(0,G,S)
resYminus <- matrix(0,G,S)


i = 1
while (length(oneLine <- readLines(con, n = 1, warn = FALSE)) > 0) {
  if(i>burnin){
      #print(i)
      seven = strsplit(oneLine, "\t")[[1]]
      thisY <- matrix(as.numeric(seven),G,S)
    
      # for individual studies
      resYplus[thisY>0] <- resYplus[thisY>0] + 1
      resYminus[thisY<0] <- resYminus[thisY<0] + 1
  }    
  i = i + 1
} 

close(con)

7.2 Detect meta pattern. We use the tight clustering algorithm to detect metaPatterns. In fact, one can also use other clustering algorithm to detect gene clusters

# we only consider HSb_qvalue<0.01 for the meta-pattern detection
resYplus_DE <- resYplus[HSb_qvalue<0.01,]
resYminus_DE <- resYminus[HSb_qvalue<0.01,]

# tight clustering
dissimilarity <- distance(resYplus_DE, resYminus_DE, niter - burnin) # calculate dissimilarity matrix for each pair of genes.
atarget <- 2

# Alternatively, one may try the consensus clustering algorithm to detect gene modules.
# perform the tight clustering to identify meta-pattern modules. 
# k.min controls the size of the final meta module. 
# A small k.min will result in large module size and a large k.min will result in small module size.
tightClustResult <- tightClustPam(dissimilarity, target=atarget, k.min=20) 
#> Number of points: 970    Dimension: 970 
#> 
#> Looking for tight cluster 1 ...
#> k = 20
#> k = 21
#> 1 tight cluster(s) found!
#> Cluster size: 272    Remaining number of points: 698 
#> 
#> Looking for tight cluster 2 ...
#> k = 19
#> k = 20
#> 2 tight cluster(s) found!
#> Cluster size: 187    Remaining number of points: 511

7.3 Visualization

7.3.1 visualize the posterior probablity


for(i in 1:atarget){
    clustSelection <- tightClustResult$cluster == i
    numS = ncol(resYplus_DE)
    barData <- cbind(resYplus_DE,resYminus_DE)[clustSelection,]/(niter - burnin)
    barMean <- colMeans(barData)
    barSd <- apply(barData,2,sd)

    mp <- barplot(barMean, axes=FALSE, axisnames=FALSE, ylim=c(0, 1.2),
                  col=c(rep('red',numS), rep('blue',numS)), main=paste('target',i,'\n n =',sum(clustSelection)), 
                  xlab="Study", ylab="prosterior probablity")
    axis(1, labels=c(paste(1:numS,"+",sep=''), paste(1:numS,"-",sep='')), at = mp)
    axis(2, at=seq(0 , 1, by=0.2))
     
    box()

    # Plot the vertical lines of the error bars
    # The vertical bars are plotted at the midpoints
    segments(mp, barMean - barSd, mp, barMean + barSd, lwd=2)
    # Now plot the horizontal bounds for the error bars
    # 1. The lower bar
    segments(mp - 0.1, barMean - barSd, mp + 0.1, barMean - barSd, lwd=2)
    # 2. The upper bar
    segments(mp - 0.1, barMean + barSd, mp + 0.1, barMean + barSd, lwd=2)
}

7.3.2 Visualize the heatmap of the first metaPattern module


for(s in 1:length(dataExp)){
    adata <- dataExp[[s]]
    aname <- names(dataExp)[s]
    bdata <- adata[HSb_qvalue<0.01, ][tightClustResult$cluster == 1 ,]
    cdata <- as.matrix(bdata)
    ddata <- t(scale(t(cdata))) # standardize the data such that for each gene, the mean is 0 and sd is 1.

    ColSideColors <- rep("black", ncol(adata))
    ColSideColors[grep('LCAD',colnames(adata))] <- "red"
    
    B <- 16
  redGreenColor <- rgb(c(rep(0, B), (0:B)/B), c((B:0)/16, rep(0, B)), rep(0, 2*B+1))
    heatmap(ddata,Rowv=NA,ColSideColors=ColSideColors,col= redGreenColor ,scale='none',Colv=NA, main=aname)
}