Moped insurance data

Partial CASCO moped insurance from Wasa insurance, 1994-1999

moped <- read.table("http://www.karlin.mff.cuni.cz/~pesta/NMFM402/moped.txt", header=T)
head(moped)
##   class age zone duration severity number pure actual  frequency
## 1     1   1    1     62.9    18256     17 4936   2049 0.27027027
## 2     1   1    2    112.9    13632      7  845   1230 0.06200177
## 3     1   1    3    133.1    20877      9 1411    762 0.06761833
## 4     1   1    4    376.6    13045      7  242    396 0.01858736
## 5     1   1    5      9.4        0      0    0    990 0.00000000
## 6     1   1    6     70.8    15000      1  212    594 0.01412429

Data description

moped <- within(moped, {
    class <- factor(class)
    age <- factor(age)
    zone <- factor(zone)
})
summary(moped)
##  class  age    zone     duration          severity         number      
##  1:14   1:14   1:4   Min.   :   4.40   Min.   :    0   Min.   :  0.00  
##  2:14   2:14   2:4   1st Qu.:  69.67   1st Qu.: 4364   1st Qu.:  1.75  
##                3:4   Median : 177.00   Median : 6928   Median :  8.50  
##                4:4   Mean   : 666.37   Mean   : 7482   Mean   : 28.07  
##                5:4   3rd Qu.: 818.20   3rd Qu.: 8218   3rd Qu.: 45.25  
##                6:4   Max.   :5505.30   Max.   :20877   Max.   :136.00  
##                7:4                                                     
##       pure            actual         frequency      
##  Min.   :   0.0   Min.   : 119.0   Min.   :0.00000  
##  1st Qu.: 114.5   1st Qu.: 238.0   1st Qu.:0.01581  
##  Median : 211.5   Median : 396.0   Median :0.03334  
##  Mean   : 608.0   Mean   : 550.0   Mean   :0.06451  
##  3rd Qu.: 668.0   3rd Qu.: 645.8   3rd Qu.:0.07782  
##  Max.   :4936.0   Max.   :2049.0   Max.   :0.27027  
## 
rel <-
    data.frame(rating.factor =
               c(rep("Vehicle class", nlevels(moped$class)),
                 rep("Vehicle age",   nlevels(moped$age)),
                 rep("Zone",          nlevels(moped$zone))),
               class =
               c(levels(moped$class),
                 levels(moped$age),
                 levels(moped$zone)),
               stringsAsFactors = FALSE)

We next calculate the duration and number of claims for each level of each rating factor. We also set the contrasts for the levels. The foreach package is convenient to use here, but you can of course do it with a normal loop and a couple of variables.

library("foreach")

## Calculate duration per rating factor level and also set the
## contrasts. We use foreach here to execute the loop both for its
## side-effect (setting the contrasts) and to accumulate the sums.
new.cols <- 
  foreach (rating.factor = c("class", "age", "zone"),
           .combine = rbind) %do%
{
    nclaims <- tapply(moped$number, moped[[rating.factor]], sum)
    sums <- tapply(moped$duration, moped[[rating.factor]], sum)
    n.levels <- nlevels(moped[[rating.factor]])
    contrasts(moped[[rating.factor]]) <-
        contr.treatment(n.levels)[rank(-sums, ties.method = "first"), ]
    data.frame(dur = sums, n.claims = nclaims)
}
rel <- cbind(rel, new.cols)
rm(new.cols)
print(rel)
##    rating.factor class     dur n.claims
## 1  Vehicle class     1  9833.2      391
## 2  Vehicle class     2  8825.1      395
## 11   Vehicle age     1  1918.4      141
## 21   Vehicle age     2 16739.9      645
## 12          Zone     1  1451.4      206
## 22          Zone     2  2486.3      209
## 3           Zone     3  2888.7      132
## 4           Zone     4 10069.1      207
## 5           Zone     5   246.1        6
## 6           Zone     6  1369.2       23
## 7           Zone     7   147.5        3

Please note: The purpose of this page is to show how to use various data analysis commands. It does not cover all aspects of the research process which actuaries are expected to do.

Number of claims (Frequency)

Exploring the number of claims

library(ggplot2)

ggplot(moped, aes(number, fill = class)) + geom_histogram(binwidth = 1) + facet_grid(class ~ ., margins = TRUE, scales = "free")

plot of chunk hist-number-class

ggplot(moped, aes(number, fill = age)) + geom_histogram(binwidth = 1) + facet_grid(age ~ ., margins = TRUE, scales = "free")

plot of chunk hist-number-age

ggplot(moped, aes(number, fill = zone)) + geom_histogram(binwidth = 1) + facet_grid(zone ~ ., margins = TRUE, scales = "free")

plot of chunk hist-number-zone

Poisson regression with log link

summary(model.frequency <- glm(number ~ class + age + zone + offset(log(duration)), data = moped, family = poisson))
## 
## Call:
## glm(formula = number ~ class + age + zone + offset(log(duration)), 
##     family = poisson, data = moped)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.5001  -0.8712  -0.3153   0.8260   1.5251  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -3.829639   0.074997 -51.064  < 2e-16 ***
## class1      -0.252640   0.073777  -3.424 0.000616 ***
## age1         0.437661   0.093954   4.658 3.19e-06 ***
## zone2        0.802747   0.111493   7.200 6.02e-13 ***
## zone3        1.428190   0.099375  14.372  < 2e-16 ***
## zone4        1.959875   0.101451  19.319  < 2e-16 ***
## zone5       -0.231218   0.219861  -1.052 0.292958    
## zone6        0.185408   0.414164   0.448 0.654393    
## zone7        0.000554   0.581627   0.001 0.999240    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 520.352  on 27  degrees of freedom
## Residual deviance:  30.077  on 19  degrees of freedom
## AIC: 157.34
## 
## Number of Fisher Scoring iterations: 5

Note here the use of the offset() term as opposed to the weights= argument. The offset is log(duration) because our link function is log.

Recall that the offset is just a predictor variable whose coefficient is fixed at 1. So, using the standard setup for a Poisson regression with a log link, we have: \[ \log\mathsf{E}Y_i={\bf x}_i^{\top}{\boldsymbol\beta}+\log\omega_i \] where \(\omega_i\) is the offset/exposure variable. This can be rewritten as \[ \log\mathsf{E}[Y_i/\omega_i]={\bf x}_i^{\top}{\boldsymbol\beta} \] Your underlying random variable is still \(Y_i\), but by dividing by \(\omega_i\) we've converted the LHS of the model equation to be a rate of events per unit exposure. But this division also alters the variance of the response, so we have to weight by \(\omega_i\) when fitting the model.

# use quasipoisson family to stop glm complaining about nonintegral response
summary(glm(number/duration ~ class + age + zone, family=quasipoisson, data=moped, weights=duration))
## 
## Call:
## glm(formula = number/duration ~ class + age + zone, family = quasipoisson, 
##     data = moped, weights = duration)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.5001  -0.8712  -0.3153   0.8260   1.5251  
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -3.829639   0.094807 -40.394  < 2e-16 ***
## class1      -0.252640   0.093264  -2.709  0.01392 *  
## age1         0.437661   0.118771   3.685  0.00157 ** 
## zone2        0.802747   0.140943   5.696 1.72e-05 ***
## zone3        1.428190   0.125624  11.369 6.42e-10 ***
## zone4        1.959875   0.128248  15.282 3.96e-12 ***
## zone5       -0.231218   0.277934  -0.832  0.41579    
## zone6        0.185408   0.523561   0.354  0.72714    
## zone7        0.000554   0.735258   0.001  0.99941    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for quasipoisson family taken to be 1.598048)
## 
##     Null deviance: 520.352  on 27  degrees of freedom
## Residual deviance:  30.077  on 19  degrees of freedom
## AIC: NA
## 
## Number of Fisher Scoring iterations: 6

The information on deviance is also provided. We can use the residual deviance to perform a goodness-of-fit test for the overall model. The residual deviance is the difference between the deviance of the current model and the maximum deviance of the ideal model where the predicted values are identical to the observed. Therefore, if the residual difference is small enough, the goodness of fit test will not be significant, indicating that the model fits the data. We conclude that the model fits reasonably well because the goodness-of-fit chi-squared test is not statistically significant. If the test had been statistically significant, it would indicate that the data do not fit the model well. In that situation, we may try to determine if there are omitted predictor variables, if our linearity assumption holds and/or if there is an issue of over-dispersion.

with(model.frequency, cbind(res.deviance = deviance, df = df.residual, p = pchisq(deviance, df.residual, lower.tail = FALSE)))
##      res.deviance df          p
## [1,]     30.07667 19 0.05083071

We can also test the overall effect of zone by comparing the deviance of the full model with the deviance of the model excluding zone. The six degree-of-freedom chi-square test indicates that zone, taken together, is a statistically significant predictor of number.

## update model.frequency model dropping zone
model.f2 <- update(model.frequency, . ~ . - zone)
## test model differences with chi square test
anova(model.f2, model.frequency, test = "Chisq")
## Analysis of Deviance Table
## 
## Model 1: number ~ class + age + offset(log(duration))
## Model 2: number ~ class + age + zone + offset(log(duration))
##   Resid. Df Resid. Dev Df Deviance  Pr(>Chi)    
## 1        25     477.34                          
## 2        19      30.08  6   447.27 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Diagnostics

Anscombe residuals

library(wle)

res_ans <- residualsAnscombe(moped$number, mu=model.frequency$fitted.values, family = poisson())
qqnorm(res_ans)
qqline(res_ans)

plot of chunk res-freq-ans

Graphical prediction

newdata1 <- expand.grid(1:7, factor(1:2), 1:2)
colnames(newdata1) <- c("zone", "class", "age")
#newdata1 <- subset(newdata1, subset = (zone <= age))
newdata1 <- cbind(newdata1,rep(median(moped$duration), dim(newdata1)[1]))
colnames(newdata1)[4] <- "duration"
newdata1 <- within(newdata1, {
    age <- factor(age)
    zone <- factor(zone)
})
newdata1$phat <- predict(model.frequency, newdata1)

ggplot(newdata1, aes(x = zone, y = phat, colour = factor(age))) + geom_point(size = 6) + facet_wrap(~class) + labs(x = "Zone", y = "Predicted number of claims")

plot of chunk pred-freq

Prediction of (relative) frequency for each rating factor.

rels <- coef( model.frequency )
rels <- exp( rels[1] + rels[-1] ) / exp( rels[1] )
rel$rels.frequency <-
    c(c(1, rels[1])[rank(-rel$dur[1:2], ties.method = "first")],
      c(1, rels[2])[rank(-rel$dur[3:4], ties.method = "first")],
      c(1, rels[3:8])[rank(-rel$dur[5:11], ties.method = "first")])

Negative binomial regression with log link

library(MASS)

summary(model.NB <- glm.nb(number ~ class + age + zone + offset(log(duration)), data = moped, control = glm.control(maxit = 25, trace = T)))
## Theta(1) = 474.742000, 2(Ls - Lm) = 28.390200
## Theta(2) = 323.492000, 2(Ls - Lm) = 27.719000
## Theta(3) = 323.492000, 2(Ls - Lm) = 27.711700
## Theta(4) = 284.807000, 2(Ls - Lm) = 27.440900
## Theta(5) = 284.808000, 2(Ls - Lm) = 27.439700
## Theta(6) = 271.483000, 2(Ls - Lm) = 27.331000
## Theta(7) = 271.483000, 2(Ls - Lm) = 27.330800
## Theta(8) = 266.449000, 2(Ls - Lm) = 27.287300
## Theta(9) = 266.446000, 2(Ls - Lm) = 27.287200
## Theta(10) = 264.477000, 2(Ls - Lm) = 27.269800
## Theta(11) = 264.475000, 2(Ls - Lm) = 27.269800
## Theta(12) = 263.695000, 2(Ls - Lm) = 27.262900
## Theta(13) = 263.693000, 2(Ls - Lm) = 27.262800
## Theta(14) = 263.382000, 2(Ls - Lm) = 27.260100
## Theta(15) = 263.382000, 2(Ls - Lm) = 27.260100
## Theta(16) = 263.256000, 2(Ls - Lm) = 27.258900
## Theta(17) = 263.257000, 2(Ls - Lm) = 27.258900
## Theta(18) = 263.206000, 2(Ls - Lm) = 27.258500
## Theta(19) = 263.208000, 2(Ls - Lm) = 27.258500
## Theta(20) = 263.187000, 2(Ls - Lm) = 27.258300
## Theta(21) = 263.188000, 2(Ls - Lm) = 27.258300
## Theta(22) = 263.179000, 2(Ls - Lm) = 27.258200
## Theta(23) = 263.180000, 2(Ls - Lm) = 27.258200
## Theta(24) = 263.176000, 2(Ls - Lm) = 27.258200
## Theta(25) = 263.176000, 2(Ls - Lm) = 27.258200
## Warning in glm.nb(number ~ class + age + zone + offset(log(duration)),
## data = moped, : alternation limit reached
## 
## Call:
## glm.nb(formula = number ~ class + age + zone + offset(log(duration)), 
##     data = moped, control = glm.control(maxit = 25, trace = T), 
##     init.theta = 263.1764465, link = log)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.3858  -0.7748  -0.2810   0.7147   1.4625  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -3.85536    0.08785 -43.886  < 2e-16 ***
## class1      -0.23859    0.08190  -2.913  0.00358 ** 
## age1         0.43126    0.09957   4.331 1.48e-05 ***
## zone2        0.82014    0.12467   6.578 4.76e-11 ***
## zone3        1.44358    0.11310  12.764  < 2e-16 ***
## zone4        1.98693    0.11444  17.362  < 2e-16 ***
## zone5       -0.20870    0.22728  -0.918  0.35849    
## zone6        0.20529    0.41803   0.491  0.62336    
## zone7        0.02115    0.58396   0.036  0.97110    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(263.1762) family taken to be 1)
## 
##     Null deviance: 429.738  on 27  degrees of freedom
## Residual deviance:  27.258  on 19  degrees of freedom
## AIC: 159.17
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  263 
##           Std. Err.:  561 
## Warning while fitting theta: alternation limit reached 
## 
##  2 x log-likelihood:  -139.166

Negative binomial models assume the conditional means are not equal to the conditional variances. This inequality is captured by estimating a dispersion parameter (not shown in the output) that is held constant in a Poisson model. Thus, the Poisson model is actually nested in the negative binomial model. We can then use a likelihood ratio test to compare these two and test this model assumption. To do this, we will run our model as a Poisson.

(diff2loglik <- as.numeric(2 * (logLik(model.NB) - logLik(model.frequency))))
## [1] 0.17505
pchisq(diff2loglik, df = 1, lower.tail = FALSE)
## [1] 0.6756621

In this example the associated chi-squared value is 0.17505 with one degree of freedom. This strongly suggests the Poisson model, without the dispersion parameter, is more appropriate than the negative binomial model (blindly ignoring the warning of number of iterations).

Claim amount (Severity)

ggplot(moped, aes(severity)) + geom_histogram() + scale_x_log10() + facet_grid(class ~ age, margins = TRUE, scales = "free_y")
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## Warning: position_stack requires constant width: output may be incorrect
## Warning: position_stack requires constant width: output may be incorrect
## Warning: position_stack requires constant width: output may be incorrect
## Warning: position_stack requires constant width: output may be incorrect
## Warning: position_stack requires constant width: output may be incorrect
## Warning: position_stack requires constant width: output may be incorrect
## Warning: position_stack requires constant width: output may be incorrect
## Warning: position_stack requires constant width: output may be incorrect
## Warning: position_stack requires constant width: output may be incorrect

plot of chunk hist-sev

ggplot(moped, aes(class, severity)) + geom_violin() + geom_jitter(size = 1.5) + scale_y_log10() + stat_smooth(aes(x = class, y = severity, group = 1), method = "loess")
## Warning: Removed 3 rows containing non-finite values (stat_ydensity).

plot of chunk violin-sev-class

ggplot(moped, aes(age, severity)) + geom_violin() + geom_jitter(size = 1.5) + scale_y_log10() + stat_smooth(aes(x = age, y = severity, group = 1), method = "loess")
## Warning: Removed 3 rows containing non-finite values (stat_ydensity).

plot of chunk violin-sev-age

ggplot(moped, aes(zone, severity)) + geom_violin() + geom_jitter(size = 1.5) + scale_y_log10() + stat_smooth(aes(x = zone, y = severity, group = 1), method = "loess")
## Warning: Removed 3 rows containing non-finite values (stat_ydensity).

plot of chunk violin-sev-zone

There are a couple of points to note here:

  1. We will model using a Gamma distribution for the errors. Note the point that this is only one of several plausible candidate distributions.
  2. Because we are using the Gamma distribution we need to remove the zero values from the data; we do this using the moped[moped$severity > 0, ] construct.
  3. We use the non-canonical log link function even though the canonical function (inverse) gives a slightly better fit (residual deviance 5.9 versus 8.0 on 16 degrees of freedom). The reason is the interpretability.
summary(model.severity <- glm(severity ~ class + age + zone, data = moped[moped$severity > 0, ], family = Gamma("log"), weights = number))
## 
## Call:
## glm(formula = severity ~ class + age + zone, family = Gamma("log"), 
##     data = moped[moped$severity > 0, ], weights = number)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -1.55662  -0.25644   0.01745   0.32310   1.42683  
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  8.85756    0.05301 167.089  < 2e-16 ***
## class1      -0.60677    0.05494 -11.044 6.78e-09 ***
## age1         0.58397    0.06943   8.411 2.88e-07 ***
## zone2        0.06416    0.08066   0.795   0.4380    
## zone3        0.07206    0.07328   0.983   0.3401    
## zone4        0.19400    0.07472   2.596   0.0195 *  
## zone5       -0.02100    0.15890  -0.132   0.8965    
## zone6        0.19151    0.29983   0.639   0.5320    
## zone7        0.18126    0.42039   0.431   0.6721    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Gamma family taken to be 0.521651)
## 
##     Null deviance: 109.7707  on 24  degrees of freedom
## Residual deviance:   7.9998  on 16  degrees of freedom
## AIC: 12377
## 
## Number of Fisher Scoring iterations: 5
(sev.resid <- as.vector(residuals(model.severity)))
##  [1]  0.75073118  0.01745143  1.42683169  0.09215516  0.20184252
##  [6] -0.57298737 -0.25643766 -0.14864016 -0.20377173 -0.17560413
## [11]  0.39737962 -0.51922734 -0.24852120 -0.47166985 -0.36223425
## [16] -1.55662151  0.52134777 -0.24190563  0.16008063  0.32310306
## [21]  0.18797378  0.02994634 -0.32529267  0.81065248  0.58028815

Some diagnostics

diagnostics <- as.data.frame(cbind(sev.resid,moped$severity[moped$severity > 0]))
colnames(diagnostics) <- c("residuals","observed")
qplot(observed, residuals, size = I(4), data = diagnostics)

plot of chunk diag-sev

qplot(observed, residuals^2, size = I(4), data = diagnostics)

plot of chunk diag-sev
Residuals on the scale of the response, \(y - \hat{y}\); in a binary logistic regression, \(y\) is 0 or 1 and \(\hat{y}\) is the fitted probability of a 1. As it turns out, response residuals aren't terribly useful for a logit model. More useful for a continuous response.

Pearson residuals

res_pearson <- residuals(model.severity, "pearson") 
qplot(diagnostics$observed, res_pearson, size = I(4), xlab="Y", ylab="Pearson residuals")

plot of chunk res-pearson
Components of the Pearson goodness-of-fit statistic.

Deviance residuals

res_deviance <- residuals(model.severity, "deviance")
qplot(diagnostics$observed, res_deviance, size = I(4), xlab="Y", ylab="Deviance residuals")

plot of chunk res-deviance
Components of the residual deviance for the model.

Working residuals

res_working <- residuals(model.severity, "working")
qplot(diagnostics$observed, res_working, size = I(4), xlab="Y", ylab="Working residuals")

plot of chunk res-working
Residuals from the final weighted-least-squares regression of the IWLS procedure used to fit the model; useful, for example, for detecting nonlinearity.

Anscombe residuals

res_anscombe <- residualsAnscombe(moped$severity[moped$severity > 0], mu=model.severity$fitted, Gamma("log"))
qplot(diagnostics$observed, res_anscombe, size = I(4), xlab="Y", ylab="Anscombe residuals")

plot of chunk res-anscombe

qqnorm(res_anscombe)
qqline(res_anscombe)

plot of chunk res-anscombe

Prediction

newdata1 <- expand.grid(1:7, factor(1:2), 1:2)
colnames(newdata1) <- c("zone", "class", "age")
#newdata1 <- subset(newdata1, subset = (zone <= age))
newdata1 <- cbind(newdata1,rep(median(moped$duration), dim(newdata1)[1]))
colnames(newdata1)[4] <- "duration"
newdata1 <- within(newdata1, {
    age <- factor(age)
    zone <- factor(zone)
})
newdata1$phat <- predict(model.severity, newdata1)

ggplot(newdata1, aes(x = zone, y = phat, colour = factor(age))) + geom_point(size = 6) + facet_wrap(~class) + labs(x = "Zone", y = "Predicted claim amount")

plot of chunk pred-sev

rels <- coef( model.severity )
rels <- exp( rels[1] + rels[-1] ) / exp( rels[1] )
## Aside: For the canonical link function use
## rels <- rels[1] / (rels[1] + rels[-1])

rel$rels.severity <-
    c(c(1, rels[1])[rank(-rel$dur[1:2], ties.method = "first")],
      c(1, rels[2])[rank(-rel$dur[3:4], ties.method = "first")],
      c(1, rels[3:8])[rank(-rel$dur[5:11], ties.method = "first")])

Other posibilities GoF test.

with(model.severity, cbind(res.deviance = deviance, df = df.residual, p = pchisq(deviance, df.residual, lower.tail = FALSE)))
##      res.deviance df         p
## [1,]      7.99982 16 0.9488717

Submodel testing.

## update model.severity model dropping zone
model.s2 <- update(model.severity, . ~ . - zone)
## test model differences with chi square test
anova(model.s2, model.severity, test = "Chisq")
## Analysis of Deviance Table
## 
## Model 1: severity ~ class + age
## Model 2: severity ~ class + age + zone
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1        22    12.0631                     
## 2        16     7.9998  6   4.0633   0.2539
## simplifued severity model
summary(model.s2)
## 
## Call:
## glm(formula = severity ~ class + age, family = Gamma("log"), 
##     data = moped[moped$severity > 0, ], weights = number)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.7323  -0.4808  -0.0401   0.5229   1.3237  
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  8.92016    0.03851 231.651  < 2e-16 ***
## class1      -0.57354    0.05421 -10.581 4.28e-10 ***
## age1         0.61521    0.07064   8.709 1.40e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Gamma family taken to be 0.5572562)
## 
##     Null deviance: 109.771  on 24  degrees of freedom
## Residual deviance:  12.063  on 22  degrees of freedom
## AIC: 12688
## 
## Number of Fisher Scoring iterations: 5

Gamma with inverse link (canonical)

summary(model.sGi <- glm(severity ~ class + age + zone, data = moped[moped$severity > 0, ], family = Gamma, weights = number))
## 
## Call:
## glm(formula = severity ~ class + age + zone, family = Gamma, 
##     data = moped[moped$severity > 0, ], weights = number)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -1.28467  -0.32335   0.03316   0.30575   0.83049  
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.465e-04  6.641e-06  22.052 2.11e-13 ***
## class1       9.039e-05  7.091e-06  12.747 8.54e-10 ***
## age1        -7.577e-05  6.483e-06 -11.688 3.01e-09 ***
## zone2       -1.092e-05  9.549e-06  -1.143    0.270    
## zone3       -7.902e-06  9.432e-06  -0.838    0.414    
## zone4       -2.217e-05  8.737e-06  -2.537    0.022 *  
## zone5        6.993e-06  2.091e-05   0.334    0.742    
## zone6       -3.576e-05  3.845e-05  -0.930    0.366    
## zone7        1.255e-05  4.395e-05   0.286    0.779    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Gamma family taken to be 0.3756071)
## 
##     Null deviance: 109.771  on 24  degrees of freedom
## Residual deviance:   5.877  on 16  degrees of freedom
## AIC: 12134
## 
## Number of Fisher Scoring iterations: 5

Gamma with inverse squared link

summary(model.sG2 <- glm(severity ~ class + age + zone, data = moped[moped$severity > 0, ], family = Gamma("1/mu^2"), weights = number))
## 
## Call:
## glm(formula = severity ~ class + age + zone, family = Gamma("1/mu^2"), 
##     data = moped[moped$severity > 0, ], weights = number)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -1.41879  -0.35551   0.04919   0.31231   1.57948  
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2.170e-08  2.567e-09   8.453 2.69e-07 ***
## class1       2.400e-08  3.459e-09   6.937 3.34e-06 ***
## age1        -1.600e-08  2.160e-09  -7.407 1.48e-06 ***
## zone2       -3.086e-09  2.907e-09  -1.062    0.304    
## zone3       -1.362e-09  3.301e-09  -0.413    0.685    
## zone4       -3.275e-09  2.801e-09  -1.169    0.259    
## zone5        1.428e-09  8.081e-09   0.177    0.862    
## zone6       -1.071e-08  1.266e-08  -0.846    0.410    
## zone7        4.780e-09  1.691e-08   0.283    0.781    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Gamma family taken to be 0.7625844)
## 
##     Null deviance: 109.771  on 24  degrees of freedom
## Residual deviance:  11.806  on 16  degrees of freedom
## AIC: 12683
## 
## Number of Fisher Scoring iterations: 8

Gamma with identity link

summary(model.sGidentity <- glm(severity ~ class + age + zone, data = moped[moped$severity > 0, ], family = Gamma("identity"), weights = number))
## 
## Call:
## glm(formula = severity ~ class + age + zone, family = Gamma("identity"), 
##     data = moped[moped$severity > 0, ], weights = number)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -1.75903  -0.44915  -0.02916   0.31961   2.03153  
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   7202.2      495.1  14.546 1.21e-10 ***
## class1       -3483.6      489.4  -7.119 2.44e-06 ***
## age1          3790.1      807.6   4.693 0.000244 ***
## zone2          366.7      664.0   0.552 0.588394    
## zone3          419.1      557.8   0.751 0.463315    
## zone4         1063.2      605.5   1.756 0.098255 .  
## zone5          217.4     1358.5   0.160 0.874887    
## zone6          710.5     2171.6   0.327 0.747773    
## zone7         2036.1     4751.9   0.428 0.674011    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Gamma family taken to be 1.071622)
## 
##     Null deviance: 109.771  on 24  degrees of freedom
## Residual deviance:  14.789  on 16  degrees of freedom
## AIC: 12861
## 
## Number of Fisher Scoring iterations: 7

Normal with identity link (canonical)

summary(model.sNcan <- glm(severity ~ class + age + zone, data = moped[moped$severity > 0, ], family = gaussian(link = "identity"), weights = number))
## 
## Call:
## glm(formula = severity ~ class + age + zone, family = gaussian(link = "identity"), 
##     data = moped[moped$severity > 0, ], weights = number)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -13626.0   -3992.1     951.4    3346.5   24341.4  
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  7312.60     740.85   9.871 3.29e-08 ***
## class1      -4252.82     767.80  -5.539 4.49e-05 ***
## age1         4817.84     970.35   4.965  0.00014 ***
## zone2         632.76    1127.29   0.561  0.58236    
## zone3         496.16    1024.12   0.484  0.63462    
## zone4        1473.01    1044.22   1.411  0.17750    
## zone5        -476.89    2220.75  -0.215  0.83268    
## zone6        1849.61    4190.30   0.441  0.66483    
## zone7         -63.27    5875.10  -0.011  0.99154    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 101884490)
## 
##     Null deviance: 6820988245  on 24  degrees of freedom
## Residual deviance: 1630151847  on 16  degrees of freedom
## AIC: 478.47
## 
## Number of Fisher Scoring iterations: 2

Normal with log link

summary(model.sNlog <- glm(severity ~ class + age + zone, data = moped[moped$severity > 0, ], family = gaussian(link = "log"), weights = number))
## 
## Call:
## glm(formula = severity ~ class + age + zone, family = gaussian(link = "log"), 
##     data = moped[moped$severity > 0, ], weights = number)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -11677.3   -2257.6     220.6    3476.8   13652.1  
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  8.84907    0.06613 133.818  < 2e-16 ***
## class1      -0.72303    0.07300  -9.904 3.14e-08 ***
## age1         0.71334    0.06569  10.859 8.63e-09 ***
## zone2        0.13812    0.09353   1.477   0.1592    
## zone3        0.05901    0.09522   0.620   0.5442    
## zone4        0.20928    0.08801   2.378   0.0302 *  
## zone5       -0.07315    0.21110  -0.347   0.7335    
## zone6        0.37682    0.35890   1.050   0.3094    
## zone7       -0.34514    0.54065  -0.638   0.5323    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 37687021)
## 
##     Null deviance: 6.821e+09  on 24  degrees of freedom
## Residual deviance: 6.030e+08  on 16  degrees of freedom
## AIC: 453.61
## 
## Number of Fisher Scoring iterations: 5

Normal with inverse link

summary(model.sNi <- glm(severity ~ class + age + zone, data = moped[moped$severity > 0, ], family = gaussian(link = "inverse"), weights = number))
## 
## Call:
## glm(formula = severity ~ class + age + zone, family = gaussian(link = "inverse"), 
##     data = moped[moped$severity > 0, ], weights = number)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -9454.8  -2351.3    798.9   2569.8   5419.4  
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.476e-04  6.131e-06  24.068 5.41e-14 ***
## class1       8.657e-05  7.699e-06  11.245 5.24e-09 ***
## age1        -7.535e-05  4.945e-06 -15.237 6.04e-11 ***
## zone2       -2.090e-05  6.774e-06  -3.086  0.00709 ** 
## zone3       -4.753e-06  8.009e-06  -0.593  0.56118    
## zone4       -1.887e-05  6.635e-06  -2.844  0.01171 *  
## zone5        3.994e-06  1.811e-05   0.221  0.82817    
## zone6       -5.031e-05  2.732e-05  -1.841  0.08417 .  
## zone7        3.789e-05  4.875e-05   0.777  0.44839    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 18337492)
## 
##     Null deviance: 6.821e+09  on 24  degrees of freedom
## Residual deviance: 2.934e+08  on 16  degrees of freedom
## AIC: 435.6
## 
## Number of Fisher Scoring iterations: 7

Normal with inverse squared link

summary(model.sNis <- glm(severity ~ class + age + zone, data = moped[moped$severity > 0, ], family = gaussian(link = "1/mu^2"), weights = number))
## 
## Call:
## glm(formula = severity ~ class + age + zone, family = gaussian(link = "1/mu^2"), 
##     data = moped[moped$severity > 0, ], weights = number)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -7176.4  -2421.2    -33.8   1763.1   9398.1  
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2.119e-08  1.803e-09  11.753 2.78e-09 ***
## class1       2.212e-08  3.286e-09   6.730 4.83e-06 ***
## age1        -1.533e-08  1.550e-09  -9.891 3.20e-08 ***
## zone2       -3.526e-09  1.539e-09  -2.290   0.0359 *  
## zone3       -8.443e-10  1.912e-09  -0.442   0.6647    
## zone4       -2.954e-09  1.543e-09  -1.914   0.0736 .  
## zone5       -1.576e-10  4.062e-09  -0.039   0.9695    
## zone6       -1.228e-08  6.148e-09  -1.997   0.0631 .  
## zone7        7.474e-09  1.480e-08   0.505   0.6205    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 25575956)
## 
##     Null deviance: 6820988245  on 24  degrees of freedom
## Residual deviance:  409209544  on 16  degrees of freedom
## AIC: 443.91
## 
## Number of Fisher Scoring iterations: 8

Inverse gaussian with inverse squared link (canonical)

summary(model.sIis <- glm(severity ~ class + age + zone, data = moped[moped$severity > 0, ], family = inverse.gaussian("1/mu^2"), weights = number))
## Warning in sqrt(eta): NaNs produced
## Error: no valid set of coefficients has been found: please supply starting values

Inverse gaussian with log link

summary(model.sIlog <- glm(severity ~ class + age + zone, data = moped[moped$severity > 0, ], family = inverse.gaussian("log"), weights = number))
## 
## Call:
## glm(formula = severity ~ class + age + zone, family = inverse.gaussian("log"), 
##     data = moped[moped$severity > 0, ], weights = number)
## 
## Deviance Residuals: 
##        Min          1Q      Median          3Q         Max  
## -0.0189284  -0.0032597  -0.0008518   0.0022828   0.0133261  
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  8.85075    0.04654 190.177  < 2e-16 ***
## class1      -0.58491    0.04720 -12.392 1.29e-09 ***
## age1         0.54405    0.06726   8.089 4.80e-07 ***
## zone2        0.05579    0.06830   0.817  0.42599    
## zone3        0.07635    0.06006   1.271  0.22181    
## zone4        0.19083    0.06302   3.028  0.00799 ** 
## zone5        0.03402    0.13641   0.249  0.80622    
## zone6        0.12829    0.23617   0.543  0.59446    
## zone7        0.37604    0.43484   0.865  0.39994    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for inverse.gaussian family taken to be 6.217246e-05)
## 
##     Null deviance: 0.0158876  on 24  degrees of freedom
## Residual deviance: 0.0009851  on 16  degrees of freedom
## AIC: 12165
## 
## Number of Fisher Scoring iterations: 5

Etc. Compare the AIC and plot the residuals.

Tweedie as compound Poisson-gamma distribution

library(tweedie)
# With exact zeros, Tweedie index xi must be between 1 and 2
# Fit the tweedie distribution
out <- tweedie.profile(severity ~ 1, xi.vec=seq(1.01, 1.99, length=9), do.plot=TRUE, data = moped)
## 1.01 1.1325 1.255 1.3775 1.5 1.6225 1.745 1.8675 1.99 
## .........Done.

plot of chunk unnamed-chunk-31

out$xi.max
## [1] 1.15
# Plot this distribution
tweedie.plot(seq(0, max(moped$severity), length=1000), mu=mean(moped$severity), xi=out$xi.max, phi=out$phi.max)

plot of chunk unnamed-chunk-31

# Fit the glm
library(statmod) # Provides tweedie family functions
summary(model.stweedie <- glm(severity ~ class + age + zone, family = tweedie(var.power=out$xi.max, link.power=0), data=moped))
## 
## Call:
## glm(formula = severity ~ class + age + zone, family = tweedie(var.power = out$xi.max, 
##     link.power = 0), data = moped)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -61.631  -13.654   -1.007    7.737   43.658  
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  9.1395475  0.3108972  29.397  < 2e-16 ***
## class1      -0.7454037  0.2257546  -3.302  0.00375 ** 
## age1         0.2899811  0.2165155   1.339  0.19627    
## zone2        0.1221210  0.3795417   0.322  0.75115    
## zone3        0.0043121  0.3889017   0.011  0.99127    
## zone4        0.1897670  0.3744810   0.507  0.61816    
## zone5       -0.0008962  0.3893320  -0.002  0.99819    
## zone6       -0.7393180  0.4666736  -1.584  0.12964    
## zone7       -0.3506001  0.4216913  -0.831  0.41607    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Tweedie family taken to be 625.2288)
## 
##     Null deviance: 27620  on 27  degrees of freedom
## Residual deviance: 15084  on 19  degrees of freedom
## AIC: NA
## 
## Number of Fisher Scoring iterations: 7

Premium: Combining the models

Now it is trivial to combine and display the results.

rel$rels.pure.premium <- with(rel, rels.frequency * rels.severity)
print(rel, digits = 2)
##    rating.factor class   dur n.claims rels.frequency rels.severity
## 1  Vehicle class     1  9833      391           1.00          1.00
## 2  Vehicle class     2  8825      395           0.78          0.55
## 11   Vehicle age     1  1918      141           1.55          1.79
## 21   Vehicle age     2 16740      645           1.00          1.00
## 12          Zone     1  1451      206           7.10          1.21
## 22          Zone     2  2486      209           4.17          1.07
## 3           Zone     3  2889      132           2.23          1.07
## 4           Zone     4 10069      207           1.00          1.00
## 5           Zone     5   246        6           1.20          1.21
## 6           Zone     6  1369       23           0.79          0.98
## 7           Zone     7   148        3           1.00          1.20
##    rels.pure.premium
## 1               1.00
## 2               0.42
## 11              2.78
## 21              1.00
## 12              8.62
## 22              4.48
## 3               2.38
## 4               1.00
## 5               1.46
## 6               0.78
## 7               1.20