d<- read_csv("Skylark Flight and Song Data.csv")
# As the IDs are compound have to separate the first letters to find the site
d %>% separate(Flight_ID, into=c("site")) ->d
# Names too long
names(d)<-c("site","Take_off","Landing","Flight_length","Song_length","x","y","Distance")
# One site seems to be mislabled as HH3
d$site<-as.factor(gsub("HH3","HH2",d$site))
write.csv(d,"skylarks.csv", row.names = FALSE)
d %>% ggplot(aes(x=site,y=Song_length)) -> g0
g0+geom_boxplot()
aqm::ci(g0)
mod<-lm(data=d, Song_length~site)
anova(mod)
## Analysis of Variance Table
##
## Response: Song_length
## Df Sum Sq Mean Sq F value Pr(>F)
## site 2 1028662 514331 21.786 9.712e-09 ***
## Residuals 114 2691357 23608
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(mod)
##
## Call:
## lm(formula = Song_length ~ site, data = d)
##
## Residuals:
## Min 1Q Median 3Q Max
## -373.26 -78.26 0.68 65.68 829.74
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 111.32 30.73 3.623 0.000437 ***
## siteHH2 92.94 36.49 2.547 0.012191 *
## siteMD 261.94 41.30 6.342 4.72e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 153.7 on 114 degrees of freedom
## Multiple R-squared: 0.2765, Adjusted R-squared: 0.2638
## F-statistic: 21.79 on 2 and 114 DF, p-value: 9.712e-09
#performance::check_model(mod)
To see where the differences lie we can use pairwise comparisons.
summary(glht(mod, linfct = mcp(site = "Tukey")))
##
## Simultaneous Tests for General Linear Hypotheses
##
## Multiple Comparisons of Means: Tukey Contrasts
##
##
## Fit: lm(formula = Song_length ~ site, data = d)
##
## Linear Hypotheses:
## Estimate Std. Error t value Pr(>|t|)
## HH2 - HH1 == 0 92.94 36.49 2.547 0.0321 *
## MD - HH1 == 0 261.94 41.30 6.342 <1e-04 ***
## MD - HH2 == 0 169.00 33.89 4.986 <1e-04 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## (Adjusted p values reported -- single-step method)
d %>% ggplot(aes(x=site,y=Flight_length)) -> g0
g0+geom_boxplot()
aqm::ci(g0)
mod<-lm(data=d, Flight_length~site)
anova(mod)
## Analysis of Variance Table
##
## Response: Flight_length
## Df Sum Sq Mean Sq F value Pr(>F)
## site 2 1089321 544661 20.024 3.526e-08 ***
## Residuals 114 3100886 27201
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(mod)
##
## Call:
## lm(formula = Flight_length ~ site, data = d)
##
## Residuals:
## Min 1Q Median 3Q Max
## -369.19 -86.26 -4.26 58.74 1121.81
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 149.72 32.99 4.539 1.41e-05 ***
## siteHH2 92.54 39.17 2.363 0.0198 *
## siteMD 268.47 44.33 6.056 1.84e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 164.9 on 114 degrees of freedom
## Multiple R-squared: 0.26, Adjusted R-squared: 0.247
## F-statistic: 20.02 on 2 and 114 DF, p-value: 3.526e-08
#performance::check_model(mod)
summary(glht(mod, linfct = mcp(site = "Tukey")))
##
## Simultaneous Tests for General Linear Hypotheses
##
## Multiple Comparisons of Means: Tukey Contrasts
##
##
## Fit: lm(formula = Flight_length ~ site, data = d)
##
## Linear Hypotheses:
## Estimate Std. Error t value Pr(>|t|)
## HH2 - HH1 == 0 92.54 39.17 2.363 0.0509 .
## MD - HH1 == 0 268.47 44.33 6.056 <1e-04 ***
## MD - HH2 == 0 175.93 36.38 4.836 <1e-04 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## (Adjusted p values reported -- single-step method)
d %>% filter(Distance<100) %>% ggplot(aes(x=Flight_length, y=Distance)) -> g0
g0 +geom_point() + geom_smooth(method="lm") + facet_wrap(~site)
This is tricky to analyse as the number of observations for each site is very small. I will try my best to get a result. First aggregate the data and count the number of flights for each combination of day and site.
library(lubridate)
d$day<-yday(as.Date(d$Take_off))
d %>% group_by(site,day) %>% summarise(n=n()) -> dd
Plotting this out looks like this (don’t use the plot in the results, I’m just trying to see the data here)
dd %>% ggplot(aes(x=day,y=n,colour=site)) -> g0
g0 +geom_point()
One way to test for signficance with count data is to use a generalised linear model of the poisson family.
mod<-glm(data=dd,n~site,family=poisson(link="identity"))
anova(mod,test="Chisq")
## Analysis of Deviance Table
##
## Model: poisson, link: identity
##
## Response: n
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 11 55.885
## site 2 8.717 9 47.168 0.0128 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
This does show overall signficance, but again the number of observations is small, so this is marginal.
summary(mod)
##
## Call:
## glm(formula = n ~ site, family = poisson(link = "identity"),
## data = dd)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.15960 -2.03800 -0.07249 1.23961 2.51847
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 6.250 1.250 5.000 5.73e-07 ***
## siteHH2 5.950 2.001 2.974 0.00294 **
## siteMD 4.083 2.238 1.825 0.06802 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 55.885 on 11 degrees of freedom
## Residual deviance: 47.168 on 9 degrees of freedom
## AIC: 99.332
##
## Number of Fisher Scoring iterations: 3
The difference that is significant is between HH1 and HH2
summary(glht(mod, linfct = mcp(site = "Tukey")))
##
## Simultaneous Tests for General Linear Hypotheses
##
## Multiple Comparisons of Means: Tukey Contrasts
##
##
## Fit: glm(formula = n ~ site, family = poisson(link = "identity"),
## data = dd)
##
## Linear Hypotheses:
## Estimate Std. Error z value Pr(>|z|)
## HH2 - HH1 == 0 5.950 2.001 2.974 0.00828 **
## MD - HH1 == 0 4.083 2.238 1.825 0.16017
## MD - HH2 == 0 -1.867 2.426 -0.770 0.72041
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## (Adjusted p values reported -- single-step method)