Este template é baseado no livro do Kruschke Doing Bayesian data analysis:
É necessário a versão actual do OpenBUGS
library(BRugs)
## Welcome to BRugs connected to OpenBUGS version 3.2.3
#------------------------------------------------------------------------------
# THE MODEL.
# Specify the model in BUGS language, but save it as a string in R:
modelString = "
# BUGS model specification begins ...
model {
# Likelihood:
for ( i in 1:nFlips ) {
y[i] ~ dbern( theta )
}
# Prior distribution:
theta ~ dbeta( priorA , priorB )
priorA <- 1
priorB <- 1
}
# ... BUGS model specification ends.
" # close quote to end modelString
# Write the modelString to a file, using R commands:
writeLines(modelString,con="model.txt")
# Use BRugs to send the model.txt file to BUGS, which checks the model syntax:
modelCheck( "model.txt" )
## model is syntactically correct
#------------------------------------------------------------------------------
# THE DATA.
# Specify the data in R, using a list format compatible with BUGS:
dataList = list(
nFlips = 14 ,
y = c( 1,1,1,1,1,1,1,1,1,1,1,0,0,0 )
)
# Use BRugs commands to put the data into a file and ship the file to BUGS:
modelData( bugsData( dataList ) )
## data loaded
#------------------------------------------------------------------------------
# INTIALIZE THE CHAIN.
modelCompile() # BRugs command tells BUGS to compile the model.
## model compiled
modelGenInits() # BRugs command tells BUGS to randomly initialize a chain.
## initial values generated, model initialized
#------------------------------------------------------------------------------
# RUN THE CHAINS.
# BRugs tells BUGS to keep a record of the sampled "theta" values:
samplesSet( "theta" )
## monitor set for variable 'theta'
# R command defines a new variable that specifies an arbitrary chain length:
chainLength = 10000
# BRugs tells BUGS to generate a MCMC chain:
modelUpdate( chainLength )
## 10000 updates took 0 s
#------------------------------------------------------------------------------
# EXAMINE THE RESULTS.
thetaSample = samplesSample( "theta" ) # BRugs asks BUGS for the sample values.
thetaSummary = samplesStats( "theta" ) # BRugs asks BUGS for summary statistics.
thetaSummary
## mean sd MC_error val2.5pc median val97.5pc start sample
## theta 0.7503 0.1043 0.001108 0.5226 0.7607 0.918 1 10000
hist(thetaSample)
samplesHistory("theta", mfrow = c(1, 1))
samplesDensity("theta", mfrow = c(1, 1))
samplesAutoC("theta", 1, mfrow = c(1, 1))
modelString = "
model {
for (i in 1:N) {
r[i] ~ dbin(p[i], n[i])
p[i] ~ dbeta(1,1)
}
}
"
writeLines(modelString,con="modelo_ex1.txt")
modelCheck( "modelo_ex1.txt" )
## model is syntactically correct
########################################################
# Load Data
dataList = list(
N=12,
n=c(47,148,119,810,211,196,148,215,207,97,256,360),
r=c(0,18,8,46,8,13,9,31,14,8,29,24)
)
modelData( bugsData( dataList ) )
## data loaded
modelCompile( numChains = 2 ) # we will use two chains
## model compiled
########################################################
# Load Initialization
initData = list ( # in this case we have two chains
c(0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5),
c(0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1)
)
genInitFactory <- function() {
i <- 0
function() {
i <<- i + 1
list( # \
p=initData[[i]] # | adapt to your specific exercise
) # /
}
}
genInit <- genInitFactory()
for (chainIdx in 1:length(initData)) {
modelInits( bugsInits( genInit ) )
}
## Initializing chain 1:
## initial values loaded and chain initialized but another chain contain uninitialized variables
## Initializing chain 2:
## model is initialized
########################################################
burn.in = 1000 # burn the first iterations
modelUpdate( burn.in )
## 1000 updates took 0 s
# Keep track of:
samplesSet( "p" )
## monitor set for variable 'p'
# Run the chain:
chainLength = 1000
modelUpdate( chainLength, thin = 10 )
## 1000 updates took 0 s
########################################################
# Check results:
N=12
pSample = NULL
pSummary = NULL
for (i in 1:N) {
pSample <- rbind( pSample, samplesSample( paste0("p[",i,"]") ) )
pSummary <- rbind( pSummary, samplesStats( paste0("p[",i,"]") ) )
}
pSummary
## mean sd MC_error val2.5pc median val97.5pc start sample
## p[1] 0.02049 0.019700 0.0004593 0.0005119 0.01488 0.07276 1001 2000
## p[2] 0.12590 0.027240 0.0005839 0.0777600 0.12480 0.18150 1001 2000
## p[3] 0.07466 0.023010 0.0005596 0.0357800 0.07264 0.12670 1001 2000
## p[4] 0.05773 0.007985 0.0001821 0.0432400 0.05767 0.07440 1001 2000
## p[5] 0.04233 0.013900 0.0003100 0.0192500 0.04099 0.07277 1001 2000
## p[6] 0.07077 0.018290 0.0003933 0.0389200 0.06959 0.10990 1001 2000
## p[7] 0.06707 0.020310 0.0003697 0.0326500 0.06538 0.11290 1001 2000
## p[8] 0.14750 0.024020 0.0005492 0.1014000 0.14630 0.19840 1001 2000
## p[9] 0.07204 0.017960 0.0003779 0.0399900 0.07086 0.11070 1001 2000
## p[10] 0.09047 0.028250 0.0006204 0.0423800 0.08834 0.15500 1001 2000
## p[11] 0.11620 0.019940 0.0004339 0.0788200 0.11620 0.15600 1001 2000
## p[12] 0.06935 0.013200 0.0002657 0.0459000 0.06868 0.09824 1001 2000
samplesHistory("*", mfrow = c(4, 3))
Segue um boxplot da simulaçao de cada um dos hospitais:
boxplot(pSample, use.cols=FALSE, xlab="Hospital", ylab="% morte")
abline(h=mean(pSummary$mean), col="red", lwd=1)
modelString = "
# y[i] quantidade de oxigenio para o i-esimo paciente
# x[i] freq. cardiaca para o i-esimo paciente
model {
# verosimilhança
for (i in 1:N) {
y[i] ~ dnorm(mu[i], tau)
mu[i] <- alpha + beta * (x[i] - xbar) # retirar a média ajuda nas auto-correlações
}
# distribuições à priori
tau ~ dgamma(1.0E-6, 1.0E-6)
sigma <- 1/tau
alpha ~ dnorm(0.0, 1.0E-6)
beta ~ dnorm(0.0, 1.0E-6)
alpha0 <- alpha - xbar*beta # repor o ajuste da média: esta é variável que temos de monitorar
}
"
writeLines(modelString,con="modelo_ex2.txt")
modelCheck( "modelo_ex2.txt" )
## model is syntactically correct
########################################################
# Load Data
dataList = list(
x = c(94,96,94,95,104,106,108,113,115,121,131),
xbar = 107,
y = c(0.47,0.75,0.83,0.98,1.18,1.29,1.4,1.6,1.75,1.9,2.23),
N = 11
)
modelData( bugsData( dataList ) )
## data loaded
modelCompile( numChains = 2 ) # we will use two chains
## model compiled
########################################################
# Load Initialization
alphas <- c(0.0, 10.0) # values for each chain
betas <- c(0.0, 20.0)
taus <- c(1.0, 10.0)
genInitFactory <- function() {
i <- 0
function() {
i <<- i + 1
list(
alpha = alphas[i],
beta = betas[i],
tau = taus[i]
)
}
}
genInit <- genInitFactory()
for (chainIdx in 1:length(initData)) {
modelInits( bugsInits( genInit ) )
}
## Initializing chain 1:
## initial values loaded and chain initialized but another chain contain uninitialized variables
## Initializing chain 2:
## model is initialized
########################################################
burn.in = 1000 # burn the first iterations
modelUpdate( burn.in )
## 1000 updates took 0 s
# Keep track of:
samplesSet( c("alpha0", "beta", "sigma") )
## monitor set for variable 'alpha0'
## monitor set for variable 'beta'
## monitor set for variable 'sigma'
# Run the chain:
chainLength = 1000
modelUpdate( chainLength, thin = 10 )
## 1000 updates took 0 s
########################################################
# Check results:
sample.alpha <- samplesSample("alpha0")
stats.alpha <- samplesStats("alpha0")
stats.alpha
## mean sd MC_error val2.5pc median val97.5pc start sample
## alpha0 -3.277 0.4088 0.007766 -4.091 -3.276 -2.471 1001 2000
sample.beta <- samplesSample("beta")
stats.beta <- samplesStats("beta")
stats.beta
## mean sd MC_error val2.5pc median val97.5pc start sample
## beta 0.04283 0.003788 7.42e-05 0.03527 0.0428 0.05036 1001 2000
sample.sigma <- samplesSample("sigma")
stats.sigma <- samplesStats("sigma")
stats.sigma
## mean sd MC_error val2.5pc median val97.5pc start sample
## sigma 0.0221 0.01442 0.0003127 0.008145 0.01844 0.05776 1001 2000
samplesHistory("*", mfrow = c(2, 2))
samplesAutoC("*", 1, mfrow = c(2, 2))
samplesAutoC("*", 2, mfrow = c(2, 2))
O resultado pedido - as distribuições à posteriori do \(\alpha\), \(\beta\) e \(\sigma^2\):
samplesDensity("*", mfrow = c(2, 2))
Podemos estimar os valores dos parâmetros:
library(MASS)
sigma.params <- fitdistr(sample.sigma, "Gamma")
sigma.params$estimate['shape']
## shape
## 3.884742
sigma.params$estimate['rate']
## rate
## 175.7624
Isto resulta numa distribuição \(\sigma^2|x \sim\) Gamma(3.88,175.8)
xs <- seq(0.001,0.3,0.001)
ys <- dgamma(xs,sigma.params$estimate['shape'], sigma.params$estimate['rate'])
plot(xs,ys,type="l")
Façamos o mesmo para \(\alpha\) e \(\beta\):
alpha.params <- fitdistr(sample.alpha, "normal")
alpha.params$estimate['mean']
## mean
## -3.27685
alpha.params$estimate['sd']
## sd
## 0.4087547
beta.params <- fitdistr(sample.beta, "normal")
beta.params$estimate['mean']
## mean
## 0.04283244
beta.params$estimate['sd']
## sd
## 0.003788456
Assim, \(\alpha|x \sim\) N(-3.28,0.409\(^2\)) e \(\beta|x \sim\) N(0.0428,0.00379\(^2\)).
3a) Uso de distribuições vagas para \(\lambda_C\) e \(\lambda_E\):
modelString = "
model {
for(i in 1:N) {
cont[i] ~ dpois(lambda.c)
exp[i] ~ dpois(lambda.e)
}
#prioris
lambda.c ~ dgamma(0.001,0.001) # c = grupo controlo
lambda.e ~ dgamma(0.001,0.001) # e = grupo experimental
delta <- lambda.c - lambda.e
}
"
writeLines(modelString,con="modelo_ex3.txt")
modelCheck( "modelo_ex3.txt" )
## model is syntactically correct
########################################################
# Load Data
dataList = list(
cont = rep(0:7,c(138,77,46,12,8,5,0,2)),
exp = rep(0:7,c(147,83,37,13,3,2,2,1)),
N = 288
)
modelData( bugsData( dataList ) )
## data loaded
modelCompile()
## model compiled
########################################################
# Load Initialization
genInitFactory <- function() {
i <- 0
function() {
i <<- i + 1
list(
lambda.c = 1.0,
lambda.e = 1.0
)
}
}
genInit <- genInitFactory()
modelInits( bugsInits( genInit ) )
## Initializing chain 1:
## model is initialized
########################################################
burn.in = 1000 # burn the first iterations
modelUpdate( burn.in )
## 1000 updates took 0 s
# Keep track of:
samplesSet( "delta" )
## monitor set for variable 'delta'
# Run the chain:
chainLength = 10000
modelUpdate( chainLength, thin = 10 )
## 10000 updates took 0 s
########################################################
# Check results:
sample.delta <- samplesSample("delta")
stats.delta <- samplesStats("delta")
stats.delta
## mean sd MC_error val2.5pc median val97.5pc start sample
## delta 0.1346 0.07726 0.00068 -0.01836 0.1338 0.2895 1001 10000
samplesDensity("delta", mfrow = c(1, 1))
A cadeia delta corresponde à diferença de duas gamas que não parece pertencer a uma família conhecida.
Mas podemos usar o pacote coda
e encontrar o HPD a 95% e verificar se o zero está lá dentro:
library(coda)
HPDinterval(as.mcmc(sample.delta), prob=0.95)
## lower upper
## var1 -0.010608 0.295946
## attr(,"Probability")
## [1] 0.95
Aqui reparamos que o zero pertence de facto ao intervalo de credibilidade. Não podemos concluir para este grau de confiança que a expeiência é significativa.
3b) Uso de distribuições conjudagas para \(\lambda_C\) e \(\lambda_E\):
modelString = "
model {
for(i in 1:N) {
cont[i] ~ dpois(lambda.c)
exp[i] ~ dpois(lambda.e)
}
#prioris
lambda.c ~ dexp(1) # c = grupo controlo
lambda.e ~ dexp(1) # e = grupo experimental
delta <- lambda.c - lambda.e
}
"
writeLines(modelString,con="modelo_ex3.txt")
modelCheck( "modelo_ex3.txt" )
## model is syntactically correct
########################################################
# Load Data
dataList = list(
cont = rep(0:7,c(138,77,46,12,8,5,0,2)),
exp = rep(0:7,c(147,83,37,13,3,2,2,1)),
N = 288
)
modelData( bugsData( dataList ) )
## data loaded
modelCompile()
## model compiled
########################################################
# Load Initialization
genInitFactory <- function() {
i <- 0
function() {
i <<- i + 1
list(
lambda.c = 1.0,
lambda.e = 1.0
)
}
}
genInit <- genInitFactory()
modelInits( bugsInits( genInit ) )
## Initializing chain 1:
## model is initialized
########################################################
burn.in = 1000 # burn the first iterations
modelUpdate( burn.in )
## 1000 updates took 0 s
# Keep track of:
samplesSet( "delta" )
## monitor set for variable 'delta'
# Run the chain:
chainLength = 10000
modelUpdate( chainLength, thin = 10 )
## 10000 updates took 0 s
########################################################
# Check results:
sample.delta <- samplesSample("delta")
stats.delta <- samplesStats("delta")
stats.delta
## mean sd MC_error val2.5pc median val97.5pc start sample
## delta 0.1349 0.0777 0.0007389 -0.01791 0.1344 0.289 1001 10000
samplesDensity("delta", mfrow = c(1, 1))
delta.chain <- as.mcmc(sample.delta)
HPDinterval(delta.chain, prob=0.95)
## lower upper
## var1 -0.016698 0.290037
## attr(,"Probability")
## [1] 0.95
O mesmo ocorre com as distribuições conjugadas à priori. Não podemos ter 95% de confiança que a experiência seja significativa.
# other plots from coda package
plot(delta.chain)
densplot(delta.chain) # another way to present the density
acfplot(delta.chain)
modelString = "
model {
for(i in 1:N) {
r[i] ~ dbin(theta[i], n[i])
logit(theta[i]) <- alpha + beta * (x[i] - xbar)
}
#prioris
alpha ~ dbeta(1,1)
beta ~ dbeta(1,1)
alpha0 <- alpha - xbar*beta
}
"
writeLines(modelString,con="modelo_ex4.txt")
modelCheck( "modelo_ex4.txt" )
## model is syntactically correct
########################################################
# Load Data
dataList = list(
n = c(59,60,62,56,63,59,62,60),
r = c(6,13,18,28,52,53,61,60),
x = c(1.6907,1.7242,1.7552,1.7842,1.8113,1.8369,1.8610,1.8839),
xbar = mean(c(1.6907,1.7242,1.7552,1.7842,1.8113,1.8369,1.8610,1.8839)),
N = 8
)
modelData( bugsData( dataList ) )
## data loaded
modelCompile()
## model compiled
########################################################
# Load Initialization
genInitFactory <- function() {
i <- 0
function() {
i <<- i + 1
list(
alpha = 0.001,
beta = 0.001
)
}
}
genInit <- genInitFactory()
modelInits( bugsInits( genInit ) )
## Initializing chain 1:
## model is initialized
########################################################
burn.in = 1000 # burn the first iterations
modelUpdate( burn.in )
## 1000 updates took 0 s
# Keep track of:
samplesSet( "alpha0" )
## monitor set for variable 'alpha0'
samplesSet( "beta" )
## monitor set for variable 'beta'
samplesSet( "theta" )
## monitor set for variable 'theta'
dicSet() # evaluate Deviance Information Criterion (more info: www.mrc-bsu.cam.ac.uk/bugs/winbugs/dicpage.shtml)
## deviance set
# Run the chain:
chainLength = 5000
modelUpdate( chainLength, thin = 1 )
## 5000 updates took 0 s
########################################################
# Check results:
sample.alpha0 <- samplesSample("alpha0")
stats.alpha0 <- samplesStats("alpha0")
stats.alpha0
## mean sd MC_error val2.5pc median val97.5pc start sample
## alpha0 -1.161 0.2195 0.01139 -1.462 -1.205 -0.6183 1001 5000
sample.beta <- samplesSample("beta")
stats.beta <- samplesStats("beta")
stats.beta
## mean sd MC_error val2.5pc median val97.5pc start sample
## beta 0.888 0.1092 0.005647 0.6027 0.9197 0.997 1001 5000
# DIC
dicStats() # TODO: não entendo o que isto significa
## Dbar Dhat DIC pD
## r 293.6 292.7 294.6 0.9774
## total 293.6 292.7 294.6 0.9774
# get theta[1:N]
N=8
thetaSample = NULL
thetaSummary = NULL
for (i in 1:N) {
thetaSample <- rbind( thetaSample, samplesSample( paste0("theta[",i,"]") ) )
thetaSummary <- rbind( thetaSummary, samplesStats( paste0("theta[",i,"]") ) )
}
thetaSummary
## mean sd MC_error val2.5pc median val97.5pc start sample
## theta[1] 0.5841 0.02258 0.0010140 0.5394 0.5839 0.6296 1001 5000
## theta[2] 0.5913 0.02234 0.0009974 0.5470 0.5913 0.6365 1001 5000
## theta[3] 0.5979 0.02214 0.0009838 0.5540 0.5979 0.6427 1001 5000
## theta[4] 0.6041 0.02197 0.0009724 0.5608 0.6041 0.6485 1001 5000
## theta[5] 0.6098 0.02184 0.0009630 0.5669 0.6096 0.6538 1001 5000
## theta[6] 0.6152 0.02173 0.0009552 0.5723 0.6151 0.6592 1001 5000
## theta[7] 0.6202 0.02163 0.0009488 0.5777 0.6201 0.6645 1001 5000
## theta[8] 0.6250 0.02156 0.0009436 0.5828 0.6248 0.6686 1001 5000
samplesDensity("*", mfrow = c(3, 4))
boxplot(thetaSample, use.cols=FALSE, xlab="Dosagem de CS2", ylab="% morte")
Um conhecido em 10 afirmações disse 10 mentiras. Qual a probabilidade de dizer verdade da próxima vez?
Analiticamente, temos que, \(y\) o número de verdades em \(N\) afirmações é modelado por uma binomial de parâmetro \(\theta\), ie, \(y \sim\) binomial(\(\theta,N\)).
Escolhemos a distribuição conjugada beta com parâmetros não informativos, ie, \(\theta \sim\) beta(1,1).
Os dados \(x\) são \(y=0\) e \(N=10\). Assim, a posterior \(\theta|x\) é dada por beta(1+y,1+N-y) = beta(1,11)). O valor esperado desta beta é \(E(beta(1,11)) = \frac{1}{1+11} = 8.3\%\)
Vamos fazer o mesmo mas agora via simulação:
modelString = "
model {
# Likelihood
y ~ dbin(theta, N) # y: número de verdades
# Priors
theta ~ dbeta(1, 1) # theta: probabilidade de dizer verdade
}
"
writeLines(modelString,con="modelo_ex-a.txt")
modelCheck( "modelo_ex-a.txt" )
## model is syntactically correct
########################################################
# Load Data
dataList = list(
N = 10,
y = 0
)
modelData( bugsData( dataList ) )
## data loaded
modelCompile()
## model compiled
########################################################
# Load Initialization
modelGenInits()
## initial values generated, model initialized
########################################################
burn.in = 1000 # burn the first iterations
modelUpdate( burn.in )
## 1000 updates took 0 s
# Keep track of:
samplesSet( "theta" )
## monitor set for variable 'theta'
# Run the chain:
chainLength = 10000
modelUpdate( chainLength, thin = 10 )
## 10000 updates took 0 s
########################################################
# Check results:
sample.theta <- samplesSample("theta")
stats.theta <- samplesStats("theta")
stats.theta
## mean sd MC_error val2.5pc median val97.5pc start sample
## theta 0.08274 0.07565 0.0007628 0.002466 0.06041 0.2835 1001 10000
samplesDensity("theta", mfrow = c(1, 1))
O resultado da simulação diz-nos que há uma média de 8.27% da próxima afirmação ser verdade. Isto corresponde com a solução analítica! :-)
Vamos considerar um problema similar:
Suppose that, in a survey of 1000 people in a state, 400 say they voted in a recent primary election. Actually, though, the voter turnout was only 30%. Give an estimate of the probability that a nonvoter will falsely state that he or she voted. (Assume that all voters honestly report that they voted.) – ref
Assumindo que os 300 votantes dizem a verdade, então temos 100 mentirosos em 700 pessoas. Fazendo as mesmas contas que anteriormente: os dados \(x\) são \(y=100\) e \(N=700\). Assim, a posterior \(\theta|x\) é dada por beta(1+y,1+N-y) = beta(101,601)). O valor esperado desta beta é \(E(beta(101,601)) = \frac{101}{702} = 14.39\%\).
Via simulação:
modelString = "
model {
# Likelihood
y ~ dbin(theta, N) # y: número de verdades
# Priors
theta ~ dbeta(1, 1) # theta: probabilidade de dizer verdade
}
"
writeLines(modelString,con="modelo_ex-a.txt")
modelCheck( "modelo_ex-a.txt" )
## model is syntactically correct
########################################################
# Load Data
dataList = list(
N = 700,
y = 100
)
modelData( bugsData( dataList ) )
## data loaded
modelCompile()
## model compiled
########################################################
# Load Initialization
modelGenInits()
## initial values generated, model initialized
########################################################
burn.in = 1000 # burn the first iterations
modelUpdate( burn.in )
## 1000 updates took 0 s
# Keep track of:
samplesSet( "theta" )
## monitor set for variable 'theta'
# Run the chain:
chainLength = 10000
modelUpdate( chainLength, thin = 1 )
## 10000 updates took 0 s
########################################################
# Check results:
sample.theta <- samplesSample("theta")
stats.theta <- samplesStats("theta")
stats.theta
## mean sd MC_error val2.5pc median val97.5pc start sample
## theta 0.1439 0.01317 0.000131 0.119 0.1434 0.1708 1001 10000
samplesDensity("theta", mfrow = c(1, 1))
A média da simulação é exatamente o mesmo valor do valor esperado.
Podemos estimar qual a distribuição beta mais próxima da densidade obtida:
library(MASS)
theta.params <- fitdistr(sample.theta, "beta", list(shape1 = 1, shape2 = 1))
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
theta.params$estimate['shape1']
## shape1
## 102.2419
theta.params$estimate['shape2']
## shape2
## 608.2619
A optimização diz-nos que a distribuição beta que melhor se ajusta à cadeia é a \(beta(102.24, 608.26)\), o que está muito perto da distribuição correta \(beta(101,601)\).