This file provides the main code snippets in the paper. Start by loading the relevant libraries.
knitr::opts_chunk$set(echo = TRUE)
set.seed(123)
library(knitr)
library(ggplot2)
library(xtable)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tibble)
library(magrittr)
library(lme4)
## Loading required package: Matrix
library(brms)
## Loading required package: Rcpp
## Loading 'brms' package (version 2.1.9). Useful instructions
## can be found by typing help('brms'). A more detailed introduction
## to the package is available through vignette('brms_overview').
## Run theme_set(theme_default()) to use the default bayesplot theme.
##
## Attaching package: 'brms'
## The following object is masked from 'package:lme4':
##
## ngrps
library(rstan)
## Loading required package: StanHeaders
## rstan (Version 2.17.3, GitRev: 2e1f913d3ca3)
## For execution on a local, multicore CPU with excess RAM we recommend calling
## options(mc.cores = parallel::detectCores()).
## To avoid recompilation of unchanged Stan programs, we recommend calling
## rstan_options(auto_write = TRUE)
##
## Attaching package: 'rstan'
## The following object is masked from 'package:magrittr':
##
## extract
rstan_options(auto_write=TRUE)
options(mc.cores=parallel::detectCores())
library(loo)
## This is loo version 1.1.0
library(bayesplot)
## This is bayesplot version 1.4.0
## - Plotting theme set to bayesplot::theme_default()
## - Online documentation at mc-stan.org/bayesplot
library(stringr)
Load Mandarin data:
rawdatM <- read.delim("data/songyuan.txt", stringsAsFactors = F)
## voiceless stops -1, voiced stops +1:
datM <- rawdatM %>%
mutate(voice = ifelse(TargetConsonant %in%
c("t", "k"),-1,+1),
genderc = if_else(gender == "f",.5,-.5),
VOT = round(msVOT,0),
i = as.integer(as.factor(paste0(gender,subject))),
j = as.integer(as.factor(WorldBet))) %>%
mutate(subject = paste0(str_to_upper(gender),str_pad(i,2,pad="0"))) %>%
select(i, j, subject, item=WorldBet, genderfact= gender, gender = genderc, VOT, vduration=msVowel,voice) %>% arrange(i,j)
datM_stops <- filter(datM, voice == -1)
Load English data:
rawdatE <- read.delim("data/english.txt", stringsAsFactors = F)
## voiceless stops -1, voiced stops +1:
datE <- rawdatE %>%
mutate(voice = ifelse(TargetConsonant %in%
c("kh", "kjh", "kwh", "th", "twh"),-1,+1),
genderc = if_else(gender == "f",.5,-.5),
VOT = round(msVOT,0),
i = as.integer(as.factor(paste0(gender,subject))),
j = as.integer(as.factor(WorldBet))) %>%
mutate(subject = paste0(str_to_upper(gender),str_pad(i,2,pad="0"))) %>%
select(i, j, subject, item=WorldBet,genderfact= gender, gender = genderc, VOT, vduration=msVowel,voice) %>% arrange(i,j)
datE_stops <- filter(datE, voice == -1)
It is generally a good idea to visualize the priors in order to get a sense of whether these are reasonable.
op<-par(mfrow=c(2,3),pty="s")
par(oma = rep(0, 4), mar = c(2.7, 2.7, 0.1, 0.1), mgp = c(1.7, 0.4, 0))
b<-seq(-priors_beta0[2]*2,priors_beta0[2]*2,by=0.01)
plot(b,dnorm(b,mean=priors_beta0[1],sd=priors_beta0[2]),type="l",ylab="density",
xlab=expression(beta[0]),ylim=c(0, 0.0082))
plot(b,dnorm(b,mean=priors_beta1[1],sd=priors_beta1[2]),type="l",ylab="density",
xlab=expression(beta[1]),ylim=c(0, 0.0082))
sig<-seq(0,priors_sigma_e[2]*3,by=0.01)
plot(sig,dnorm(sig,mean=priors_sigma_e[1],sd=priors_sigma_e[2]),type="l",ylab="density",
xlab=expression(sigma[e]))
plot(sig,dnorm(sig,mean=priors_sigma_u[1],sd=priors_sigma_u[2]),type="l",ylab="density",
xlab=expression(sigma[u[0]]))
plot(sig,dnorm(sig,mean=priors_sigma_u[1],sd=priors_sigma_u[2]),type="l",ylab="density",
xlab=expression(sigma[w[0,1]]))
plot(density(corrs[[3]],bw=0.15),ylab="density",xlab=expression(rho[w]),xlim=c(-1,1),main="")
priors <- c(set_prior("normal(0, 200)", class = "Intercept"),
set_prior("normal(0, 50)", class = "b",
coef = "gender"),
set_prior("normal(0, 100)", class = "sd"),
set_prior("normal(0, 100)", class = "sigma"),
set_prior("lkj(2)", class = "cor"))
## Mandarin:
m1M <- brm(formula = VOT ~ gender + (1 | subject) + (gender | item),
data = datM_stops, family = gaussian(), prior = priors,
iter = 2000, chains = 4, control = list(adapt_delta = 0.99))
## English:
m1E <- brm(formula = VOT ~ gender + (1 | subject) + (gender | item),
data = datE_stops, family = gaussian(), prior = priors,
iter = 2000, chains = 4, control = list(adapt_delta = 0.99))
Summarize the model fit:
summary(m1M)
## Family: gaussian
## Links: mu = identity; sigma = identity
## Formula: VOT ~ gender + (1 | subject) + (gender | item)
## Data: datM_stops (Number of observations: 200)
## Samples: 4 chains, each with iter = 2000; warmup = 1000; thin = 1;
## total post-warmup samples = 4000
## ICs: LOO = NA; WAIC = NA; R2 = NA
##
## Group-Level Effects:
## ~item (Number of levels: 10)
## Estimate Est.Error l-95% CI u-95% CI Eff.Sample Rhat
## sd(Intercept) 6.66 2.31 3.22 12.21 1320 1.00
## sd(gender) 5.57 3.50 0.48 13.73 1655 1.00
## cor(Intercept,gender) -0.29 0.38 -0.87 0.52 4000 1.00
##
## ~subject (Number of levels: 20)
## Estimate Est.Error l-95% CI u-95% CI Eff.Sample Rhat
## sd(Intercept) 9.07 2.03 5.75 13.61 1390 1.00
##
## Population-Level Effects:
## Estimate Est.Error l-95% CI u-95% CI Eff.Sample Rhat
## Intercept 85.64 3.25 78.95 91.89 1255 1.00
## gender 13.27 4.87 3.77 23.12 1140 1.00
##
## Family Specific Parameters:
## Estimate Est.Error l-95% CI u-95% CI Eff.Sample Rhat
## sigma 14.02 0.77 12.60 15.64 4000 1.00
##
## Samples were drawn using sampling(NUTS). For each parameter, Eff.Sample
## is a crude measure of effective sample size, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).
stanplot(m1M, pars = c("^b","sd","sigma"), type="hist")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## correlation:
stanplot(m1M, pars = c("cor"), type="hist")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
priors_N20 <- c(set_prior("normal(0, 200)", class = "Intercept"),
set_prior("normal(0,20)", class = "b", coef="gender"),
set_prior("normal(0, 100)", class = "sd"),
set_prior("normal(0, 100)", class = "sigma"),
set_prior("lkj(2)", class = "cor"))
m1M_N20 <- brm(formula = VOT ~ 1 + gender + (1 | subject)+(1 +gender| item),
data = datM_stops,
family = gaussian(),
prior = priors_N20,
save_all_pars = TRUE,
iter = 10000,
warmup = 2000,
chains = 4,
control = list(adapt_delta = 0.99))
priors_N20_0 <- c(set_prior("normal(0, 200)", class = "Intercept"),
#set_prior("normal(0,20)", class = "b", coef="gender"),
set_prior("normal(0, 100)", class = "sd"),
set_prior("normal(0, 100)", class = "sigma"),
set_prior("lkj(2)", class = "cor"))
m0M_N20 <- brm(formula = VOT ~ 1 + (1 | subject)+(1 +gender| item),
data = datM_stops,
family = gaussian(),
prior = priors_N20_0,
save_all_pars = TRUE,
iter = 10000,
warmup = 2000,
chains = 4,
control = list(adapt_delta = 0.99))
(BF10M_N20 <- bayes_factor(m1M_N20, m0M_N20))
## Iteration: 1
## Iteration: 2
## Iteration: 3
## Iteration: 4
## Iteration: 5
## Iteration: 6
## Iteration: 1
## Iteration: 2
## Iteration: 3
## Iteration: 4
## Iteration: 5
## The estimated Bayes factor in favor of x1 over x2 is equal to: 6.39461
pp_check(m1M, nsamples = 100)+
theme(text = element_text(size=16),legend.text=element_text(size=16))
loom1m0<-loo(m1M,m0M)
difflooic <- loom1m0$ic_diffs__
m1loo <- loom1m0$m1M
m0loo <- loom1m0$m0M
loo(m1M, m0M)
## LOOIC SE
## m1M 1653.14 22.43
## m0M 1654.79 22.60
## m1M - m0M -1.64 2.14
datMvoiced <- datM %>%
filter(voice==1) %>%
group_by(subject) %>%
summarize(meanvdur= mean(vduration),sdvdur= sd(vduration), sevdur= sd(vduration)/sqrt(length(vduration))) %>%
# mutate(cmeanvdur = scale(meanvdur, scale=FALSE))
# %>%
mutate(c_meanvdur = scale(meanvdur,scale=FALSE), cmeanvdur = scale(meanvdur), sestdvdur=sevdur/sdvdur)
meansM <- datM_stops %>% group_by(subject) %>%
summarize(meanVOT= mean(VOT), seVOT = sd(VOT)/sqrt(length(VOT))) %>% right_join(datMvoiced, by="subject")
priors_cauchy <- c(set_prior("normal(0, 200)", class = "Intercept"),
set_prior("cauchy(0,5)", class = "b",
coef = "mec_meanvdursevdur"),
set_prior("normal(0, 20)", class = "sdme"),
set_prior("normal(0, 20)", class = "sd"))
m2M_error <- brm(formula = meanVOT | se(seVOT) ~ me(c_meanvdur, sevdur) +
(1 | subject),
data = meansM, family = gaussian(), prior = priors_cauchy,
iter = 2000, chains = 4,
control = list(adapt_delta = 0.999,
max_treedepth=15))
print(m2M_error)
## Family: gaussian
## Links: mu = identity; sigma = identity
## Formula: meanVOT | se(seVOT) ~ me(c_meanvdur, sevdur) + (1 | subject)
## Data: meansM (Number of observations: 20)
## Samples: 4 chains, each with iter = 2000; warmup = 1000; thin = 1;
## total post-warmup samples = 4000
## ICs: LOO = NA; WAIC = NA; R2 = NA
##
## Group-Level Effects:
## ~subject (Number of levels: 20)
## Estimate Est.Error l-95% CI u-95% CI Eff.Sample Rhat
## sd(Intercept) 8.95 3.19 1.71 15.28 620 1.01
##
## Population-Level Effects:
## Estimate Est.Error l-95% CI u-95% CI Eff.Sample Rhat
## Intercept 85.80 4.78 79.39 93.44 809 1.00
## mec_meanvdursevdur 0.66 0.92 -0.24 2.91 331 1.01
##
## Samples were drawn using sampling(NUTS). For each parameter, Eff.Sample
## is a crude measure of effective sample size, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).
stanplot(m2M_error,pars=c("^b","sd"), type="hist")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
For canned models like the linear mixed models shown above, brms is great. However, if you are serious about learning Bayesian data analysis, eventually you will have to get your hands dirty with some Stan coding. An example follows.
Suppose we want to use measurement error only on the vowel durations computed by subject, but we want to keep the independent variable (VOT) unaggregated, i.e., we retain the repeated measurements from subjects. Defining such a model in Stan is easy and transparent.
This model can apparently be fit in brms too, but I (Shravan Vasishth) find this presentation below much more transparent as a modeler. I see the transparency as an advantage of Stan. Of course, a big downside of this is the start-up cost of learning Stan coding, but this is worth the effort, in my opinion.
First, we set up the Mandarin data, as an example:
datM$subject<-as.integer(as.factor(datM$subject))
datMvoiced<-subset(datM,voice==1)
## create a subject vector:
subj<-sort(unique(datMvoiced$subject))
## compute subject means and SEs for vowel duration:
means<-ses<-rep(NA,length(subj))
K<-length(subj)
for(k in 1:20){
#print(subj[i])
dat_temp<-subset(datMvoiced,subject==k)
n<-dim(dat_temp)[1]
means[k]<-mean(dat_temp$vduration)
ses[k]<-sd(dat_temp$vduration)/sqrt(n)
}
subj_vduration<-data.frame(subject=subj,
meanvdur=means,se=ses)
## needed for brms:
subj_vduration2<-subj_vduration
## divide centered means by std deviation to normalize the centered means:
subj_vduration2$cmeanvdur<-(subj_vduration$meanvdur-mean(subj_vduration$meanvdur))/sd(subj_vduration$meanvdur)
## standardize standard error:
subj_vduration2$sestd<-subj_vduration$se/sd(subj_vduration$meanvdur)
## need merged data frame for brms:
Mdatmerged<-merge(subset(datM,voice==-1),
subj_vduration2,
by.x=c("subject"),
by.y=c("subject"))
head(Mdatmerged)
## subject i j item genderfact gender VOT vduration voice meanvdur
## 1 1 1 11 ka3.che1 f 0.5 104 235.0481 -1 160.2405
## 2 1 1 12 kan4.bing4 f 0.5 102 195.2637 -1 160.2405
## 3 1 1 13 ku4.zi f 0.5 119 266.5568 -1 160.2405
## 4 1 1 14 kun4.le f 0.5 113 175.2815 -1 160.2405
## 5 1 1 15 tang2.kuai4 f 0.5 82 173.8481 -1 160.2405
## 6 1 1 16 tao1.zi f 0.5 102 351.1389 -1 160.2405
## se cmeanvdur sestd
## 1 11.63752 -0.2938542 0.6438201
## 2 11.63752 -0.2938542 0.6438201
## 3 11.63752 -0.2938542 0.6438201
## 4 11.63752 -0.2938542 0.6438201
## 5 11.63752 -0.2938542 0.6438201
## 6 11.63752 -0.2938542 0.6438201
Mdat2.stan<-list(subj=as.integer(factor(Mdatmerged$subject)),
item=as.integer(factor(Mdatmerged$item)),
gend=Mdatmerged$gender,
meanvdur=subj_vduration2$cmeanvdur,
se=subj_vduration2$sestd,
J=length(unique(Mdatmerged$subj)),
K=length(unique(Mdatmerged$item)),
N=dim(Mdatmerged)[1],
y=Mdatmerged$VOT)
The Stan model below now fits a measurement error model, on unaggregated VOT data but aggregated estimates of centered and normalized mean vowel duration along with their measurement error (standardized standard error).
m2Mmeaserrstan <- stan(file = "MeasErrVarInt.stan",
data = Mdat2.stan,
control = list(adapt_delta = 0.999,max_treedepth=12))
The model looks like this:
data {
int<lower = 1> N; //number of data points
vector[N] y; //dep variable vot
int<lower = 1> J; //number of subjects
int<lower = 1> K; //number of items
vector[J] meanvdur; // noisy centered mean vdur
vector<lower = 0>[J] se; // se mean vdur
int<lower = 1, upper = J> subj[N]; //subject id
int<lower = 1, upper = K> item[N]; //item id
}
parameters {
vector[2] beta; //fixed intercept and slopes
vector[J] true_mvdur; // true unknown value mvdur
vector[J] u; //subject intercepts
vector[K] w; //item intercepts
real<lower=0> sigma_e; //error sd
real<lower=0> sigma_u; //subj sd
real<lower=0> sigma_w; //item sd
}
model {
vector[N] mu;
//priors
true_mvdur ~ normal(0, 200);
meanvdur ~ normal(true_mvdur, se); // measurement model
beta[1] ~ normal(0, 200);
beta[2] ~ normal(0, 5);
sigma_e ~ normal(0, 20);
sigma_u ~ normal(0, 20);
sigma_w ~ normal(0, 20);
u ~ normal(0, sigma_u); //subj random effects
w ~ normal(0, sigma_w); //item random effects
// likelihood
mu = beta[1] + u[subj] + w[item] + beta[2] * true_mvdur[subj];
y ~ normal(mu, sigma_e);
}
The results can be displayed/summarized in different ways:
summary(m2Mmeaserrstan,pars=c("beta[1]","beta[2]","sigma_e","sigma_u","sigma_w"))$summary
## mean se_mean sd 2.5% 25% 50%
## beta[1] 85.477821 0.09250376 3.2420603 79.1108299 83.381874 85.503550
## beta[2] 3.608424 0.09034079 2.2076018 -0.8631694 2.167486 3.671719
## sigma_e 14.173873 0.01214384 0.7680437 12.7930987 13.633453 14.137042
## sigma_u 10.022432 0.05596693 2.3674830 5.9823669 8.382852 9.830164
## sigma_w 6.447058 0.04371849 2.2191240 3.1904121 4.913783 6.120046
## 75% 97.5% n_eff Rhat
## beta[1] 87.624640 91.68943 1228.3535 1.0007423
## beta[2] 5.126110 7.87781 597.1367 1.0035876
## sigma_e 14.695652 15.73582 4000.0000 0.9995529
## sigma_u 11.441780 15.22619 1789.4138 1.0013520
## sigma_w 7.578858 11.87848 2576.5160 1.0003564
rstan::plot(m2Mmeaserrstan,pars=c("beta[1]","beta[2]","sigma_e","sigma_u","sigma_w"),type="hist")
## ci_level: 0.8 (80% intervals)
## outer_level: 0.95 (95% intervals)
It is now possible to fit a brms model that is similar to this one, but I have not tested the brms code.
What is important to learn here is the Stan code allows us to express exactly how we believe the data were generated.
Here are the packages I used to fit the models.
sessionInfo()
## R version 3.4.4 (2018-03-15)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS Sierra 10.12.6
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] bindrcpp_0.2.2 stringr_1.3.1 bayesplot_1.4.0
## [4] loo_1.1.0 rstan_2.17.3 StanHeaders_2.17.2
## [7] brms_2.1.9 Rcpp_0.12.17 lme4_1.1-17
## [10] Matrix_1.2-12 magrittr_1.5 tibble_1.4.2
## [13] dplyr_0.7.5 xtable_1.8-2 ggplot2_2.2.1
## [16] knitr_1.20
##
## loaded via a namespace (and not attached):
## [1] Brobdingnag_1.2-5 splines_3.4.4 gtools_3.5.0
## [4] threejs_0.3.1 shiny_1.0.5 assertthat_0.2.0
## [7] stats4_3.4.4 yaml_2.1.19 pillar_1.2.1
## [10] backports_1.1.2 lattice_0.20-35 glue_1.2.0
## [13] digest_0.6.15 minqa_1.2.4 colorspace_1.3-2
## [16] htmltools_0.3.6 httpuv_1.3.6.2 plyr_1.8.4
## [19] dygraphs_1.1.1.4 pkgconfig_2.0.1 purrr_0.2.5
## [22] mvtnorm_1.0-7 scales_0.5.0 DT_0.4
## [25] shinyjs_1.0 lazyeval_0.2.1 mime_0.5
## [28] evaluate_0.10.1 nlme_3.1-131.1 MASS_7.3-49
## [31] xts_0.10-2 colourpicker_1.0 rsconnect_0.8.8
## [34] tools_3.4.4 matrixStats_0.53.1 munsell_0.4.3
## [37] compiler_3.4.4 rlang_0.2.1 grid_3.4.4
## [40] nloptr_1.0.4 htmlwidgets_1.0 crosstalk_1.0.0
## [43] igraph_1.2.1 miniUI_0.1.1 labeling_0.3
## [46] base64enc_0.1-3 rmarkdown_1.9.8 codetools_0.2-15
## [49] gtable_0.2.0 inline_0.3.14 abind_1.4-5
## [52] markdown_0.8 reshape2_1.4.3 R6_2.2.2
## [55] gridExtra_2.3 rstantools_1.4.0 zoo_1.8-1
## [58] bridgesampling_0.4-0 shinythemes_1.1.1 bindr_0.1.1
## [61] shinystan_2.4.0 rprojroot_1.3-2 stringi_1.2.2
## [64] parallel_3.4.4 tidyselect_0.2.4 coda_0.19-1