Exercise

A researcher is interested in whether dark corrugated iron strips are used more frequently by sand lizards than plain corrugated iron. The researcher has 20 pieces of iron of each type and places them on five different sites at Arne. The strips are inspected every day for two weeks in spring. The total number of sandlizards found under each strip is recorded each day as the response variable (data may also be collected on weather conditions etc.. but you can ignore this).

Design a fake data set that could be used as the template for a simple analysis of the data using an appropriate analysis of variance. Run an analysis on the fake data.

Answer the folowing questions.

  1. What feature of the design may be considered to lead to blocking?
  2. How many levels of the factor are there?
  3. How might subsampling be handled?
  4. Which feature of the response variable may cause particular difficulties when working with real life data?

Simulating the data

set.seed(1)
id<-rep(as.factor(c(1:40)),each=14)  ## Make an identifier for each sheet of metal
site<-rep(rep(c("S1","S2","S3","S4","S5"),each=8),each=14) # Replicate the design 14 times
treat_type<-rep(rep(c("B","Z"),times=20),each=14) # Nest the treatments within site
treat_effect<-rep(rep(c(0.2,0),times=20),each=14) # Mean lizard count per sheet associated with treat
site_effect<-rep(rep(c(0.1,0.2,0.1,0.3,0.4),each=8),each=14) # Additive site effect
day<-rep(c(1:14),times=40) ## Code the 14 days of the experiment
overall_effect<-site_effect+treat_effect ## Overall effect is additive
lizard_count<-unlist(lapply(overall_effect,function(x)rpois(1,x))) ## Simulate counts from poisson
d<-data.frame(day,id,site,treat_type,lizard_count) ## Form a data frame

Aggregating

There are various options available for working with these data. The simplest (and often the best) involve aggregating the data to form variables that do not suffer from as much pseudo-replication. There are repeated measures on the same sheet, that may include the same individuals counted multiple times. The sum of observations of lizards over the 14 day period may be a reasonable measure of how attractive the sheets are.

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(DT)
d %>% group_by(id,site,treat_type) %>% summarise(sum=sum(lizard_count)) ->d1
datatable(d1)

Notice that using dplyr it is easy to change the level of aggregation by adding, or dropping, grouping variables.

d %>% group_by(site,treat_type) %>% summarise(sum=sum(lizard_count)) ->d2
datatable(d2)
d %>% group_by(site) %>% summarise(sum=sum(lizard_count)) ->d3
datatable(d3)
d %>% group_by(treat_type) %>% summarise(sum=sum(lizard_count)) ->d4
datatable(d4)

Simple analysis

library(ggplot2)
g0<-ggplot(d1,aes(y=sum,x=treat_type)) 
g1 <- g0 + facet_wrap(~site)
g1+geom_boxplot() 

g0+stat_summary(fun.y=mean,geom="point") + stat_summary(fun.data=mean_cl_normal,geom="errorbar")

g1 + stat_summary(fun.y=mean,geom="point") + stat_summary(fun.data=mean_cl_normal,geom="errorbar")

Model

library(multcomp)
## Loading required package: mvtnorm
## Loading required package: survival
## Loading required package: TH.data
## Loading required package: MASS
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
## 
## Attaching package: 'TH.data'
## The following object is masked from 'package:MASS':
## 
##     geyser
library(lmerTest)
## Loading required package: lme4
## Loading required package: Matrix
## 
## Attaching package: 'lmerTest'
## The following object is masked from 'package:lme4':
## 
##     lmer
## The following object is masked from 'package:stats':
## 
##     step
mod<-lmer(data=d1,sum~treat_type+(1|site))
anova(mod)
## Type III Analysis of Variance Table with Satterthwaite's method
##            Sum Sq Mean Sq NumDF  DenDF F value  Pr(>F)  
## treat_type     10      10     1 33.999  4.0354 0.05255 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(mod)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: sum ~ treat_type + (1 | site)
##    Data: d1
## 
## REML criterion at convergence: 159.4
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -2.19723 -0.57520 -0.00247  0.47619  2.24951 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  site     (Intercept) 4.677    2.163   
##  Residual             2.478    1.574   
## Number of obs: 40, groups:  site, 5
## 
## Fixed effects:
##             Estimate Std. Error      df t value Pr(>|t|)   
## (Intercept)   4.9500     1.0292  4.5131   4.809  0.00632 **
## treat_typeZ  -1.0000     0.4978 33.9991  -2.009  0.05255 . 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr)
## treat_typeZ -0.242
mod<-lm(data=d1,sum~treat_type+site)
anova(mod)
## Analysis of Variance Table
## 
## Response: sum
##            Df Sum Sq Mean Sq F value    Pr(>F)    
## treat_type  1  10.00  10.000  4.0356   0.05254 .  
## site        4 159.65  39.912 16.1071 1.721e-07 ***
## Residuals  34  84.25   2.478                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(mod)
## 
## Call:
## lm(formula = sum ~ treat_type + site, data = d1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.6250 -0.8750  0.0000  0.8125  3.3750 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   3.0000     0.6097   4.921 2.18e-05 ***
## treat_typeZ  -1.0000     0.4978  -2.009   0.0525 .  
## siteS2        1.2500     0.7871   1.588   0.1215    
## siteS3       -0.1250     0.7871  -0.159   0.8748    
## siteS4        4.0000     0.7871   5.082 1.34e-05 ***
## siteS5        4.6250     0.7871   5.876 1.25e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.574 on 34 degrees of freedom
## Multiple R-squared:  0.6682, Adjusted R-squared:  0.6194 
## F-statistic: 13.69 on 5 and 34 DF,  p-value: 2.392e-07
par(mar=c(4,8,4,2)) 
plot(glht(mod, linfct = mcp(site = "Tukey")))

Comparison with fully nested random effects model

mod1<-lmer(data=d1,sum~treat_type+(1|site))
mod2<-lmer(data=d,lizard_count~treat_type+(1|site/id))
## boundary (singular) fit: see ?isSingular
anova(mod1)
## Type III Analysis of Variance Table with Satterthwaite's method
##            Sum Sq Mean Sq NumDF  DenDF F value  Pr(>F)  
## treat_type     10      10     1 33.999  4.0354 0.05255 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(mod2)
## Type III Analysis of Variance Table with Satterthwaite's method
##             Sum Sq Mean Sq NumDF DenDF F value Pr(>F)
## treat_type 0.71429 0.71429     1   554   2.207  0.138