library(ggplot2)
library(dplyr)
library(mgcv)
library(DT)
theme_set(theme_bw())
# Helper to display a data frame as an interactive table with download buttons
dt<-function(x) DT::datatable(x,
    filter = "top",
    extensions = c('Buttons', 'ColReorder'),
    options = list(
      dom = 'Blfrtip',
      buttons = c('copy', 'csv', 'excel'),
      colReorder = TRUE
    ))

Introduction

This markdown document can be adapted to run a regression analysis on any pair of variables in the Arne data set. You can also use it to run analyses on subsets of the data.

Loading the data

library(readxl)
d<- read_excel("/home/ricardol/Arne data/Arne data 2/Book1.xlsx", 
    sheet = "Sheet2")
names(d)<-c("id","Age","Ht","Circ")
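
It is worth a quick check that the data loaded as expected. str() is base R; dt() is the interactive table helper defined in the setup chunk.

str(d)   # column types and number of rows
dt(d)    # browse, filter and download the data interactively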

Choose your data and variables

Choose which is your x and y variable

This example will look at tree height against circumference. You can change the variables in this chunk and the labels when running any other analyses.

df<-d
df$x<-df$Circ
df$y<-df$Ht
xlabel<-"Circumference (cm)"
ylabel <-"Height (cm)"
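
If you want to run the analysis on a subset of the data, as mentioned in the introduction, filter before assigning the x and y columns. This is only a sketch: the Age cut-off used here is arbitrary and just for illustration.

df <- d %>% filter(Age > 5)   # hypothetical cut-off: replace with your own condition

Then assign the x and y columns and labels as above.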

Data summaries for individual variables

Change the name of the variable to match a numerical variable in your own data set. The command removes NAs, just in case you have any.

summary(df$y,na.rm=TRUE)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00   40.00   69.50   76.21  110.00  210.00
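
If you would rather have the key statistics in a single table, a dplyr sketch along these lines works with the y column created above.

df %>% summarise(
  n      = sum(!is.na(y)),
  mean   = mean(y, na.rm = TRUE),
  sd     = sd(y, na.rm = TRUE),
  median = median(y, na.rm = TRUE)
)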

Confidence interval for the mean of y

confint(lm(df$y~1))
##                2.5 %   97.5 %
## (Intercept) 63.13475 89.27904
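
The intercept-only model above is just a convenient way of getting the classic t-based interval for the mean. The same interval can be computed by hand, which makes the formula explicit: the mean plus or minus the t quantile times the standard error.

n  <- sum(!is.na(df$y))
m  <- mean(df$y, na.rm = TRUE)
se <- sd(df$y, na.rm = TRUE) / sqrt(n)
m + c(-1, 1) * qt(0.975, df = n - 1) * se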

Simple boxplot of one variable

Useful for your own quick visualisation.

boxplot(df$y)
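
If you prefer a ggplot version that matches the style of the other figures, one option is the following sketch.

ggplot(df, aes(x = "", y = y)) +
  geom_boxplot() +
  xlab("") + ylab(ylabel)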

Histogram of one variable

This uses ggplot. Change the binwidth if you want to use this.

g0<-ggplot(df,aes(x=y))
g0+geom_histogram(color="grey",binwidth = 10)+xlab(ylabel)

Regression

In this data set there are two numerical variables, so we can run a linear regression.

Scatterplot without fitted line

g0<-ggplot(df,aes(x=x,y=y))
g0+geom_point() +xlab(xlabel) +ylab(ylabel)

Scatterplot with fitted line and labels

g0<-ggplot(df,aes(x=x,y=y))
g1<-g0+geom_point() + geom_smooth(method="lm") 
g1 + xlab(xlabel) + ylab(ylabel)

Fitting a model

Change the names of the variables in the first line.

mod<-lm(y~x, data=df)

Model summary

summary(mod)
## 
## Call:
## lm(formula = y ~ x, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -67.542 -10.443  -1.264  12.246  73.424 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  30.9516     4.5000   6.878  5.5e-09 ***
## x             7.9060     0.5673  13.937  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 23.73 on 56 degrees of freedom
## Multiple R-squared:  0.7762, Adjusted R-squared:  0.7722 
## F-statistic: 194.3 on 1 and 56 DF,  p-value: < 2.2e-16
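
The numbers you normally report (the slope, R-squared and p-value) can also be pulled out of the fitted model directly, for example:

coef(mod)                    # intercept and slope
summary(mod)$r.squared       # R-squared
summary(mod)$adj.r.squared   # adjusted R-squared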

Model anova table

anova(mod)
## Analysis of Variance Table
## 
## Response: y
##           Df Sum Sq Mean Sq F value    Pr(>F)    
## x          1 109359  109359  194.25 < 2.2e-16 ***
## Residuals 56  31527     563                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Confidence intervals for the model parameters

confint(mod)
##                 2.5 %    97.5 %
## (Intercept) 21.937074 39.966115
## x            6.769702  9.042392
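
The fitted model can also be used to predict y for a new value of x, together with a confidence interval for the fitted line. The value 5 used here is only an illustration; replace it with a circumference of interest.

predict(mod, newdata = data.frame(x = 5), interval = "confidence")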

Model diagnostics

Look at the regression handout to understand these plots.

plot(mod,which=1)   # residuals vs fitted values: checks linearity and equal variance

plot(mod,which=2)   # normal Q-Q plot: checks normality of the residuals

plot(mod,which=3)   # scale-location plot: another check of equal variance

plot(mod,which=4)   # Cook's distance: flags influential observations

plot(mod,which=5)   # residuals vs leverage: influence and outliers together
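
If you want all of the diagnostics on one page rather than five separate figures, a base-graphics sketch is:

par(mfrow = c(2, 3))    # arrange the panels in a 2 x 3 grid
plot(mod, which = 1:5)
par(mfrow = c(1, 1))    # reset the plotting layout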

Spearman’s rank correlation

Use this if the assumptions of linear regression cannot be met. Not needed with these data, but included for reference.

g0<-ggplot(df,aes(x=rank(x),y=rank(y)))
g0+geom_point() + geom_smooth(method="lm") +xlab(paste("Rank",xlabel)) +ylab(paste("Rank",ylabel))

cor.test(df$x,df$y,method="spearman")
## 
##  Spearman's rank correlation rho
## 
## data:  df$x and df$y
## S = 2610.5, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## 0.9196983
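
With tied values R will usually warn that it cannot compute an exact p-value. If you see that warning, you can request the asymptotic approximation instead:

cor.test(df$x, df$y, method = "spearman", exact = FALSE)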

Fitting a spline

Only use if you suspect that the relationship is not well described by a straight line.

library(mgcv)

g0<-ggplot(df,aes(x=x,y=y))
g1<-g0 + geom_point() + geom_smooth(method="gam", formula =y~s(x,k=3))
g1 + xlab(xlabel) + ylab(ylabel)

In this case the fitted curve is almost the same as the straight line from the linear model. Get a summary using this code.

mod<-gam(y~s(x,k=3), data=df)
summary(mod)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## y ~ s(x, k = 3)
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   76.207      2.979   25.59   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##        edf Ref.df   F p-value    
## s(x) 1.841  1.975 114  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.792   Deviance explained = 79.9%
## GCV = 541.05  Scale est. = 514.55    n = 58

If you do use this model, remember that it's only needed if you can't use linear regression. Report the adjusted R-squared value, the estimated degrees of freedom (edf) and the p-value for the smooth term (not the intercept). If the spline is significant you must include the figure in your report, as that is the only way to show the shape of the response.
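
One informal way to decide whether the spline is worth the extra complexity is to compare the two models with AIC (lower is better). This is only a rough sketch, not a formal test.

mod_lm  <- lm(y ~ x, data = df)
mod_gam <- gam(y ~ s(x, k = 3), data = df)
AIC(mod_lm, mod_gam)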