+/-

moped <- read.table("http://www.karlin.mff.cuni.cz/~pesta/NMFM402/moped.txt", header=T)
head(moped)

##   class age zone duration severity number pure actual  frequency
## 1     1   1    1     62.9    18256     17 4936   2049 0.27027027
## 2     1   1    2    112.9    13632      7  845   1230 0.06200177
## 3     1   1    3    133.1    20877      9 1411    762 0.06761833
## 4     1   1    4    376.6    13045      7  242    396 0.01858736
## 5     1   1    5      9.4        0      0    0    990 0.00000000
## 6     1   1    6     70.8    15000      1  212    594 0.01412429

Data description

moped <- within(moped, {
    class <- factor(class)
    age <- factor(age)
    zone <- factor(zone)
})
summary(moped)

##  class  age    zone     duration          severity         number      
##  1:14   1:14   1:4   Min.   :   4.40   Min.   :    0   Min.   :  0.00  
##  2:14   2:14   2:4   1st Qu.:  69.67   1st Qu.: 4364   1st Qu.:  1.75  
##                3:4   Median : 177.00   Median : 6928   Median :  8.50  
##                4:4   Mean   : 666.37   Mean   : 7482   Mean   : 28.07  
##                5:4   3rd Qu.: 818.20   3rd Qu.: 8218   3rd Qu.: 45.25  
##                6:4   Max.   :5505.30   Max.   :20877   Max.   :136.00  
##                7:4                                                     
##       pure            actual         frequency      
##  Min.   :   0.0   Min.   : 119.0   Min.   :0.00000  
##  1st Qu.: 114.5   1st Qu.: 238.0   1st Qu.:0.01581  
##  Median : 211.5   Median : 396.0   Median :0.03334  
##  Mean   : 608.0   Mean   : 550.0   Mean   :0.06451  
##  3rd Qu.: 668.0   3rd Qu.: 645.8   3rd Qu.:0.07782  
##  Max.   :4936.0   Max.   :2049.0   Max.   :0.27027  
##

rel <-
    data.frame(rating.factor =
               c(rep("Vehicle class", nlevels(moped$class)),
                 rep("Vehicle age",   nlevels(moped$age)),
                 rep("Zone",          nlevels(moped$zone))),
               class =
               c(levels(moped$class),
                 levels(moped$age),
                 levels(moped$zone)),
               stringsAsFactors = FALSE)

## Calculate duration per rating factor level and also set the
## contrasts. We use foreach here to execute the loop both for its
## side-effect (setting the contrasts) and to accumulate the sums.
new.cols <- 
  foreach (rating.factor = c("class", "age", "zone"),
           .combine = rbind) %do%
{
    nclaims <- tapply(moped$number, moped[[rating.factor]], sum)
    sums <- tapply(moped$duration, moped[[rating.factor]], sum)
    n.levels <- nlevels(moped[[rating.factor]])
    contrasts(moped[[rating.factor]]) <-
        contr.treatment(n.levels)[rank(-sums, ties.method = "first"), ]
    data.frame(dur = sums, n.claims = nclaims)
}
rel <- cbind(rel, new.cols)
rm(new.cols)
print(rel)

##    rating.factor class     dur n.claims
## 1  Vehicle class     1  9833.2      391
## 2  Vehicle class     2  8825.1      395
## 11   Vehicle age     1  1918.4      141
## 21   Vehicle age     2 16739.9      645
## 12          Zone     1  1451.4      206
## 22          Zone     2  2486.3      209
## 3           Zone     3  2888.7      132
## 4           Zone     4 10069.1      207
## 5           Zone     5   246.1        6
## 6           Zone     6  1369.2       23
## 7           Zone     7   147.5        3

Please note: The purpose of this page is to show how to use various data analysis commands. It does not cover all aspects of the research process which actuaries are expected to do.

Number of claims (Frequency)

Exploring the number of claims

ggplot(moped, aes(number, fill = class)) + geom_histogram(binwidth = 1) + facet_grid(class ~ ., margins = TRUE, scales = "free")

ggplot(moped, aes(number, fill = age)) + geom_histogram(binwidth = 1) + facet_grid(age ~ ., margins = TRUE, scales = "free")

ggplot(moped, aes(number, fill = zone)) + geom_histogram(binwidth = 1) + facet_grid(zone ~ ., margins = TRUE, scales = "free")

Poisson regression with log link

summary(model.frequency <- glm(number ~ class + age + zone + offset(log(duration)), data = moped, family = poisson))

## 
## Call:
## glm(formula = number ~ class + age + zone + offset(log(duration)), 
##     family = poisson, data = moped)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.5001  -0.8712  -0.3153   0.8260   1.5251  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -3.829639   0.074997 -51.064  < 2e-16 ***
## class1      -0.252640   0.073777  -3.424 0.000616 ***
## age1         0.437661   0.093954   4.658 3.19e-06 ***
## zone2        0.802747   0.111493   7.200 6.02e-13 ***
## zone3        1.428190   0.099375  14.372  < 2e-16 ***
## zone4        1.959875   0.101451  19.319  < 2e-16 ***
## zone5       -0.231218   0.219861  -1.052 0.292958    
## zone6        0.185408   0.414164   0.448 0.654393    
## zone7        0.000554   0.581627   0.001 0.999240    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 520.352  on 27  degrees of freedom
## Residual deviance:  30.077  on 19  degrees of freedom
## AIC: 157.34
## 
## Number of Fisher Scoring iterations: 5

Recall that the offset is just a predictor variable whose coefficient is fixed at 1. So, using the standard setup for a Poisson regression with a log link, we have: \[ \log\mathsf{E}Y_i={\bf x}_i^{\top}{\boldsymbol\beta}+\log\omega_i \] where \(\omega_i\) is the offset/exposure variable. This can be rewritten as \[ \log\mathsf{E}[Y_i/\omega_i]={\bf x}_i^{\top}{\boldsymbol\beta} \] Your underlying random variable is still \(Y_i\), but by dividing by \(\omega_i\) we've converted the LHS of the model equation to be a rate of events per unit exposure. But this division also alters the variance of the response, so we have to weight by \(\omega_i\) when fitting the model.

# use quasipoisson family to stop glm complaining about nonintegral response
summary(glm(number/duration ~ class + age + zone, family=quasipoisson, data=moped, weights=duration))

## 
## Call:
## glm(formula = number/duration ~ class + age + zone, family = quasipoisson, 
##     data = moped, weights = duration)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.5001  -0.8712  -0.3153   0.8260   1.5251  
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -3.829639   0.094807 -40.394  < 2e-16 ***
## class1      -0.252640   0.093264  -2.709  0.01392 *  
## age1         0.437661   0.118771   3.685  0.00157 ** 
## zone2        0.802747   0.140943   5.696 1.72e-05 ***
## zone3        1.428190   0.125624  11.369 6.42e-10 ***
## zone4        1.959875   0.128248  15.282 3.96e-12 ***
## zone5       -0.231218   0.277934  -0.832  0.41579    
## zone6        0.185408   0.523561   0.354  0.72714    
## zone7        0.000554   0.735258   0.001  0.99941    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for quasipoisson family taken to be 1.598048)
## 
##     Null deviance: 520.352  on 27  degrees of freedom
## Residual deviance:  30.077  on 19  degrees of freedom
## AIC: NA
## 
## Number of Fisher Scoring iterations: 6

The information on deviance is also provided. We can use the residual deviance to perform a goodness-of-fit test for the overall model. The residual deviance is the difference between the deviance of the current model and the maximum deviance of the ideal model where the predicted values are identical to the observed. Therefore, if the residual difference is small enough, the goodness of fit test will not be significant, indicating that the model fits the data. We conclude that the model fits reasonably well because the goodness-of-fit chi-squared test is not statistically significant. If the test had been statistically significant, it would indicate that the data do not fit the model well. In that situation, we may try to determine if there are omitted predictor variables, if our linearity assumption holds and/or if there is an issue of over-dispersion.

with(model.frequency, cbind(res.deviance = deviance, df = df.residual, p = pchisq(deviance, df.residual, lower.tail = FALSE)))

##      res.deviance df          p
## [1,]     30.07667 19 0.05083071

We can also test the overall effect of zone by comparing the deviance of the full model with the deviance of the model excluding zone. The six degree-of-freedom chi-square test indicates that zone, taken together, is a statistically significant predictor of number.

## update model.frequency model dropping zone
model.f2 <- update(model.frequency, . ~ . - zone)
## test model differences with chi square test
anova(model.f2, model.frequency, test = "Chisq")

## Analysis of Deviance Table
## 
## Model 1: number ~ class + age + offset(log(duration))
## Model 2: number ~ class + age + zone + offset(log(duration))
##   Resid. Df Resid. Dev Df Deviance  Pr(>Chi)    
## 1        25     477.34                          
## 2        19      30.08  6   447.27 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

res_ans <- residualsAnscombe(moped$number, mu=model.frequency$fitted.values, family = poisson())
qqnorm(res_ans)
qqline(res_ans)

newdata1 <- expand.grid(1:7, factor(1:2), 1:2)
colnames(newdata1) <- c("zone", "class", "age")
#newdata1 <- subset(newdata1, subset = (zone <= age))
newdata1 <- cbind(newdata1,rep(median(moped$duration), dim(newdata1)[1]))
colnames(newdata1)[4] <- "duration"
newdata1 <- within(newdata1, {
    age <- factor(age)
    zone <- factor(zone)
})
newdata1$phat <- predict(model.frequency, newdata1)

ggplot(newdata1, aes(x = zone, y = phat, colour = factor(age))) + geom_point(size = 6) + facet_wrap(~class) + labs(x = "Zone", y = "Predicted number of claims")

rels <- coef( model.frequency )
rels <- exp( rels[1] + rels[-1] ) / exp( rels[1] )
rel$rels.frequency <-
    c(c(1, rels[1])[rank(-rel$dur[1:2], ties.method = "first")],
      c(1, rels[2])[rank(-rel$dur[3:4], ties.method = "first")],
      c(1, rels[3:8])[rank(-rel$dur[5:11], ties.method = "first")])

Negative binomial regression with log link

summary(model.NB <- glm.nb(number ~ class + age + zone + offset(log(duration)), data = moped, control = glm.control(maxit = 25, trace = T)))

## Theta(1) = 474.742000, 2(Ls - Lm) = 28.390200
## Theta(2) = 323.492000, 2(Ls - Lm) = 27.719000
## Theta(3) = 323.492000, 2(Ls - Lm) = 27.711700
## Theta(4) = 284.807000, 2(Ls - Lm) = 27.440900
## Theta(5) = 284.808000, 2(Ls - Lm) = 27.439700
## Theta(6) = 271.483000, 2(Ls - Lm) = 27.331000
## Theta(7) = 271.483000, 2(Ls - Lm) = 27.330800
## Theta(8) = 266.449000, 2(Ls - Lm) = 27.287300
## Theta(9) = 266.446000, 2(Ls - Lm) = 27.287200
## Theta(10) = 264.477000, 2(Ls - Lm) = 27.269800
## Theta(11) = 264.475000, 2(Ls - Lm) = 27.269800
## Theta(12) = 263.695000, 2(Ls - Lm) = 27.262900
## Theta(13) = 263.693000, 2(Ls - Lm) = 27.262800
## Theta(14) = 263.382000, 2(Ls - Lm) = 27.260100
## Theta(15) = 263.382000, 2(Ls - Lm) = 27.260100
## Theta(16) = 263.256000, 2(Ls - Lm) = 27.258900
## Theta(17) = 263.257000, 2(Ls - Lm) = 27.258900
## Theta(18) = 263.206000, 2(Ls - Lm) = 27.258500
## Theta(19) = 263.208000, 2(Ls - Lm) = 27.258500
## Theta(20) = 263.187000, 2(Ls - Lm) = 27.258300
## Theta(21) = 263.188000, 2(Ls - Lm) = 27.258300
## Theta(22) = 263.179000, 2(Ls - Lm) = 27.258200
## Theta(23) = 263.180000, 2(Ls - Lm) = 27.258200
## Theta(24) = 263.176000, 2(Ls - Lm) = 27.258200
## Theta(25) = 263.176000, 2(Ls - Lm) = 27.258200

## Warning in glm.nb(number ~ class + age + zone + offset(log(duration)),
## data = moped, : alternation limit reached

## 
## Call:
## glm.nb(formula = number ~ class + age + zone + offset(log(duration)), 
##     data = moped, control = glm.control(maxit = 25, trace = T), 
##     init.theta = 263.1764465, link = log)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.3858  -0.7748  -0.2810   0.7147   1.4625  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -3.85536    0.08785 -43.886  < 2e-16 ***
## class1      -0.23859    0.08190  -2.913  0.00358 ** 
## age1         0.43126    0.09957   4.331 1.48e-05 ***
## zone2        0.82014    0.12467   6.578 4.76e-11 ***
## zone3        1.44358    0.11310  12.764  < 2e-16 ***
## zone4        1.98693    0.11444  17.362  < 2e-16 ***
## zone5       -0.20870    0.22728  -0.918  0.35849    
## zone6        0.20529    0.41803   0.491  0.62336    
## zone7        0.02115    0.58396   0.036  0.97110    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(263.1762) family taken to be 1)
## 
##     Null deviance: 429.738  on 27  degrees of freedom
## Residual deviance:  27.258  on 19  degrees of freedom
## AIC: 159.17
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  263 
##           Std. Err.:  561 
## Warning while fitting theta: alternation limit reached 
## 
##  2 x log-likelihood:  -139.166

Negative binomial models assume the conditional means are not equal to the conditional variances. This inequality is captured by estimating a dispersion parameter (not shown in the output) that is held constant in a Poisson model. Thus, the Poisson model is actually nested in the negative binomial model. We can then use a likelihood ratio test to compare these two and test this model assumption. To do this, we will run our model as a Poisson.

(diff2loglik <- as.numeric(2 * (logLik(model.NB) - logLik(model.frequency))))

## [1] 0.17505

pchisq(diff2loglik, df = 1, lower.tail = FALSE)

## [1] 0.6756621

Claim amount (Severity)

ggplot(moped, aes(severity)) + geom_histogram() + scale_x_log10() + facet_grid(class ~ age, margins = TRUE, scales = "free_y")

## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.

## Warning: position_stack requires constant width: output may be incorrect

## Warning: position_stack requires constant width: output may be incorrect

## Warning: position_stack requires constant width: output may be incorrect

## Warning: position_stack requires constant width: output may be incorrect

## Warning: position_stack requires constant width: output may be incorrect

## Warning: position_stack requires constant width: output may be incorrect

## Warning: position_stack requires constant width: output may be incorrect

## Warning: position_stack requires constant width: output may be incorrect

## Warning: position_stack requires constant width: output may be incorrect

ggplot(moped, aes(class, severity)) + geom_violin() + geom_jitter(size = 1.5) + scale_y_log10() + stat_smooth(aes(x = class, y = severity, group = 1), method = "loess")

## Warning: Removed 3 rows containing non-finite values (stat_ydensity).

ggplot(moped, aes(age, severity)) + geom_violin() + geom_jitter(size = 1.5) + scale_y_log10() + stat_smooth(aes(x = age, y = severity, group = 1), method = "loess")

## Warning: Removed 3 rows containing non-finite values (stat_ydensity).

ggplot(moped, aes(zone, severity)) + geom_violin() + geom_jitter(size = 1.5) + scale_y_log10() + stat_smooth(aes(x = zone, y = severity, group = 1), method = "loess")

## Warning: Removed 3 rows containing non-finite values (stat_ydensity).

summary(model.severity <- glm(severity ~ class + age + zone, data = moped[moped$severity > 0, ], family = Gamma("log"), weights = number))

## 
## Call:
## glm(formula = severity ~ class + age + zone, family = Gamma("log"), 
##     data = moped[moped$severity > 0, ], weights = number)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -1.55662  -0.25644   0.01745   0.32310   1.42683  
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  8.85756    0.05301 167.089  < 2e-16 ***
## class1      -0.60677    0.05494 -11.044 6.78e-09 ***
## age1         0.58397    0.06943   8.411 2.88e-07 ***
## zone2        0.06416    0.08066   0.795   0.4380    
## zone3        0.07206    0.07328   0.983   0.3401    
## zone4        0.19400    0.07472   2.596   0.0195 *  
## zone5       -0.02100    0.15890  -0.132   0.8965    
## zone6        0.19151    0.29983   0.639   0.5320    
## zone7        0.18126    0.42039   0.431   0.6721    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Gamma family taken to be 0.521651)
## 
##     Null deviance: 109.7707  on 24  degrees of freedom
## Residual deviance:   7.9998  on 16  degrees of freedom
## AIC: 12377
## 
## Number of Fisher Scoring iterations: 5

(sev.resid <- as.vector(residuals(model.severity)))

##  [1]  0.75073118  0.01745143  1.42683169  0.09215516  0.20184252
##  [6] -0.57298737 -0.25643766 -0.14864016 -0.20377173 -0.17560413
## [11]  0.39737962 -0.51922734 -0.24852120 -0.47166985 -0.36223425
## [16] -1.55662151  0.52134777 -0.24190563  0.16008063  0.32310306
## [21]  0.18797378  0.02994634 -0.32529267  0.81065248  0.58028815

diagnostics <- as.data.frame(cbind(sev.resid,moped$severity[moped$severity > 0]))
colnames(diagnostics) <- c("residuals","observed")
qplot(observed, residuals, size = I(4), data = diagnostics)

qplot(observed, residuals^2, size = I(4), data = diagnostics)

res_pearson <- residuals(model.severity, "pearson") 
qplot(diagnostics$observed, res_pearson, size = I(4), xlab="Y", ylab="Pearson residuals")

res_deviance <- residuals(model.severity, "deviance")
qplot(diagnostics$observed, res_deviance, size = I(4), xlab="Y", ylab="Deviance residuals")

res_working <- residuals(model.severity, "working")
qplot(diagnostics$observed, res_working, size = I(4), xlab="Y", ylab="Working residuals")

res_anscombe <- residualsAnscombe(moped$severity[moped$severity > 0], mu=model.severity$fitted, Gamma("log"))
qplot(diagnostics$observed, res_anscombe, size = I(4), xlab="Y", ylab="Anscombe residuals")

qqnorm(res_anscombe)
qqline(res_anscombe)

newdata1 <- expand.grid(1:7, factor(1:2), 1:2)
colnames(newdata1) <- c("zone", "class", "age")
#newdata1 <- subset(newdata1, subset = (zone <= age))
newdata1 <- cbind(newdata1,rep(median(moped$duration), dim(newdata1)[1]))
colnames(newdata1)[4] <- "duration"
newdata1 <- within(newdata1, {
    age <- factor(age)
    zone <- factor(zone)
})
newdata1$phat <- predict(model.severity, newdata1)

ggplot(newdata1, aes(x = zone, y = phat, colour = factor(age))) + geom_point(size = 6) + facet_wrap(~class) + labs(x = "Zone", y = "Predicted claim amount")

rels <- coef( model.severity )
rels <- exp( rels[1] + rels[-1] ) / exp( rels[1] )
## Aside: For the canonical link function use
## rels <- rels[1] / (rels[1] + rels[-1])

rel$rels.severity <-
    c(c(1, rels[1])[rank(-rel$dur[1:2], ties.method = "first")],
      c(1, rels[2])[rank(-rel$dur[3:4], ties.method = "first")],
      c(1, rels[3:8])[rank(-rel$dur[5:11], ties.method = "first")])

with(model.severity, cbind(res.deviance = deviance, df = df.residual, p = pchisq(deviance, df.residual, lower.tail = FALSE)))

##      res.deviance df         p
## [1,]      7.99982 16 0.9488717

## update model.severity model dropping zone
model.s2 <- update(model.severity, . ~ . - zone)
## test model differences with chi square test
anova(model.s2, model.severity, test = "Chisq")

## Analysis of Deviance Table
## 
## Model 1: severity ~ class + age
## Model 2: severity ~ class + age + zone
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1        22    12.0631                     
## 2        16     7.9998  6   4.0633   0.2539

## simplifued severity model
summary(model.s2)

## 
## Call:
## glm(formula = severity ~ class + age, family = Gamma("log"), 
##     data = moped[moped$severity > 0, ], weights = number)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.7323  -0.4808  -0.0401   0.5229   1.3237  
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  8.92016    0.03851 231.651  < 2e-16 ***
## class1      -0.57354    0.05421 -10.581 4.28e-10 ***
## age1         0.61521    0.07064   8.709 1.40e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Gamma family taken to be 0.5572562)
## 
##     Null deviance: 109.771  on 24  degrees of freedom
## Residual deviance:  12.063  on 22  degrees of freedom
## AIC: 12688
## 
## Number of Fisher Scoring iterations: 5

summary(model.sGi <- glm(severity ~ class + age + zone, data = moped[moped$severity > 0, ], family = Gamma, weights = number))

## 
## Call:
## glm(formula = severity ~ class + age + zone, family = Gamma, 
##     data = moped[moped$severity > 0, ], weights = number)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -1.28467  -0.32335   0.03316   0.30575   0.83049  
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.465e-04  6.641e-06  22.052 2.11e-13 ***
## class1       9.039e-05  7.091e-06  12.747 8.54e-10 ***
## age1        -7.577e-05  6.483e-06 -11.688 3.01e-09 ***
## zone2       -1.092e-05  9.549e-06  -1.143    0.270    
## zone3       -7.902e-06  9.432e-06  -0.838    0.414    
## zone4       -2.217e-05  8.737e-06  -2.537    0.022 *  
## zone5        6.993e-06  2.091e-05   0.334    0.742    
## zone6       -3.576e-05  3.845e-05  -0.930    0.366    
## zone7        1.255e-05  4.395e-05   0.286    0.779    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Gamma family taken to be 0.3756071)
## 
##     Null deviance: 109.771  on 24  degrees of freedom
## Residual deviance:   5.877  on 16  degrees of freedom
## AIC: 12134
## 
## Number of Fisher Scoring iterations: 5

summary(model.sG2 <- glm(severity ~ class + age + zone, data = moped[moped$severity > 0, ], family = Gamma("1/mu^2"), weights = number))

## 
## Call:
## glm(formula = severity ~ class + age + zone, family = Gamma("1/mu^2"), 
##     data = moped[moped$severity > 0, ], weights = number)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -1.41879  -0.35551   0.04919   0.31231   1.57948  
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2.170e-08  2.567e-09   8.453 2.69e-07 ***
## class1       2.400e-08  3.459e-09   6.937 3.34e-06 ***
## age1        -1.600e-08  2.160e-09  -7.407 1.48e-06 ***
## zone2       -3.086e-09  2.907e-09  -1.062    0.304    
## zone3       -1.362e-09  3.301e-09  -0.413    0.685    
## zone4       -3.275e-09  2.801e-09  -1.169    0.259    
## zone5        1.428e-09  8.081e-09   0.177    0.862    
## zone6       -1.071e-08  1.266e-08  -0.846    0.410    
## zone7        4.780e-09  1.691e-08   0.283    0.781    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Gamma family taken to be 0.7625844)
## 
##     Null deviance: 109.771  on 24  degrees of freedom
## Residual deviance:  11.806  on 16  degrees of freedom
## AIC: 12683
## 
## Number of Fisher Scoring iterations: 8

summary(model.sGidentity <- glm(severity ~ class + age + zone, data = moped[moped$severity > 0, ], family = Gamma("identity"), weights = number))

## 
## Call:
## glm(formula = severity ~ class + age + zone, family = Gamma("identity"), 
##     data = moped[moped$severity > 0, ], weights = number)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -1.75903  -0.44915  -0.02916   0.31961   2.03153  
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   7202.2      495.1  14.546 1.21e-10 ***
## class1       -3483.6      489.4  -7.119 2.44e-06 ***
## age1          3790.1      807.6   4.693 0.000244 ***
## zone2          366.7      664.0   0.552 0.588394    
## zone3          419.1      557.8   0.751 0.463315    
## zone4         1063.2      605.5   1.756 0.098255 .  
## zone5          217.4     1358.5   0.160 0.874887    
## zone6          710.5     2171.6   0.327 0.747773    
## zone7         2036.1     4751.9   0.428 0.674011    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Gamma family taken to be 1.071622)
## 
##     Null deviance: 109.771  on 24  degrees of freedom
## Residual deviance:  14.789  on 16  degrees of freedom
## AIC: 12861
## 
## Number of Fisher Scoring iterations: 7

summary(model.sNcan <- glm(severity ~ class + age + zone, data = moped[moped$severity > 0, ], family = gaussian(link = "identity"), weights = number))

## 
## Call:
## glm(formula = severity ~ class + age + zone, family = gaussian(link = "identity"), 
##     data = moped[moped$severity > 0, ], weights = number)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -13626.0   -3992.1     951.4    3346.5   24341.4  
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  7312.60     740.85   9.871 3.29e-08 ***
## class1      -4252.82     767.80  -5.539 4.49e-05 ***
## age1         4817.84     970.35   4.965  0.00014 ***
## zone2         632.76    1127.29   0.561  0.58236    
## zone3         496.16    1024.12   0.484  0.63462    
## zone4        1473.01    1044.22   1.411  0.17750    
## zone5        -476.89    2220.75  -0.215  0.83268    
## zone6        1849.61    4190.30   0.441  0.66483    
## zone7         -63.27    5875.10  -0.011  0.99154    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 101884490)
## 
##     Null deviance: 6820988245  on 24  degrees of freedom
## Residual deviance: 1630151847  on 16  degrees of freedom
## AIC: 478.47
## 
## Number of Fisher Scoring iterations: 2

summary(model.sNlog <- glm(severity ~ class + age + zone, data = moped[moped$severity > 0, ], family = gaussian(link = "log"), weights = number))

## 
## Call:
## glm(formula = severity ~ class + age + zone, family = gaussian(link = "log"), 
##     data = moped[moped$severity > 0, ], weights = number)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -11677.3   -2257.6     220.6    3476.8   13652.1  
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  8.84907    0.06613 133.818  < 2e-16 ***
## class1      -0.72303    0.07300  -9.904 3.14e-08 ***
## age1         0.71334    0.06569  10.859 8.63e-09 ***
## zone2        0.13812    0.09353   1.477   0.1592    
## zone3        0.05901    0.09522   0.620   0.5442    
## zone4        0.20928    0.08801   2.378   0.0302 *  
## zone5       -0.07315    0.21110  -0.347   0.7335    
## zone6        0.37682    0.35890   1.050   0.3094    
## zone7       -0.34514    0.54065  -0.638   0.5323    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 37687021)
## 
##     Null deviance: 6.821e+09  on 24  degrees of freedom
## Residual deviance: 6.030e+08  on 16  degrees of freedom
## AIC: 453.61
## 
## Number of Fisher Scoring iterations: 5

summary(model.sNi <- glm(severity ~ class + age + zone, data = moped[moped$severity > 0, ], family = gaussian(link = "inverse"), weights = number))

## 
## Call:
## glm(formula = severity ~ class + age + zone, family = gaussian(link = "inverse"), 
##     data = moped[moped$severity > 0, ], weights = number)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -9454.8  -2351.3    798.9   2569.8   5419.4  
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.476e-04  6.131e-06  24.068 5.41e-14 ***
## class1       8.657e-05  7.699e-06  11.245 5.24e-09 ***
## age1        -7.535e-05  4.945e-06 -15.237 6.04e-11 ***
## zone2       -2.090e-05  6.774e-06  -3.086  0.00709 ** 
## zone3       -4.753e-06  8.009e-06  -0.593  0.56118    
## zone4       -1.887e-05  6.635e-06  -2.844  0.01171 *  
## zone5        3.994e-06  1.811e-05   0.221  0.82817    
## zone6       -5.031e-05  2.732e-05  -1.841  0.08417 .  
## zone7        3.789e-05  4.875e-05   0.777  0.44839    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 18337492)
## 
##     Null deviance: 6.821e+09  on 24  degrees of freedom
## Residual deviance: 2.934e+08  on 16  degrees of freedom
## AIC: 435.6
## 
## Number of Fisher Scoring iterations: 7

summary(model.sNis <- glm(severity ~ class + age + zone, data = moped[moped$severity > 0, ], family = gaussian(link = "1/mu^2"), weights = number))

## 
## Call:
## glm(formula = severity ~ class + age + zone, family = gaussian(link = "1/mu^2"), 
##     data = moped[moped$severity > 0, ], weights = number)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -7176.4  -2421.2    -33.8   1763.1   9398.1  
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2.119e-08  1.803e-09  11.753 2.78e-09 ***
## class1       2.212e-08  3.286e-09   6.730 4.83e-06 ***
## age1        -1.533e-08  1.550e-09  -9.891 3.20e-08 ***
## zone2       -3.526e-09  1.539e-09  -2.290   0.0359 *  
## zone3       -8.443e-10  1.912e-09  -0.442   0.6647    
## zone4       -2.954e-09  1.543e-09  -1.914   0.0736 .  
## zone5       -1.576e-10  4.062e-09  -0.039   0.9695    
## zone6       -1.228e-08  6.148e-09  -1.997   0.0631 .  
## zone7        7.474e-09  1.480e-08   0.505   0.6205    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 25575956)
## 
##     Null deviance: 6820988245  on 24  degrees of freedom
## Residual deviance:  409209544  on 16  degrees of freedom
## AIC: 443.91
## 
## Number of Fisher Scoring iterations: 8

summary(model.sIis <- glm(severity ~ class + age + zone, data = moped[moped$severity > 0, ], family = inverse.gaussian("1/mu^2"), weights = number))

## Warning in sqrt(eta): NaNs produced

## Error: no valid set of coefficients has been found: please supply starting values

summary(model.sIlog <- glm(severity ~ class + age + zone, data = moped[moped$severity > 0, ], family = inverse.gaussian("log"), weights = number))

## 
## Call:
## glm(formula = severity ~ class + age + zone, family = inverse.gaussian("log"), 
##     data = moped[moped$severity > 0, ], weights = number)
## 
## Deviance Residuals: 
##        Min          1Q      Median          3Q         Max  
## -0.0189284  -0.0032597  -0.0008518   0.0022828   0.0133261  
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  8.85075    0.04654 190.177  < 2e-16 ***
## class1      -0.58491    0.04720 -12.392 1.29e-09 ***
## age1         0.54405    0.06726   8.089 4.80e-07 ***
## zone2        0.05579    0.06830   0.817  0.42599    
## zone3        0.07635    0.06006   1.271  0.22181    
## zone4        0.19083    0.06302   3.028  0.00799 ** 
## zone5        0.03402    0.13641   0.249  0.80622    
## zone6        0.12829    0.23617   0.543  0.59446    
## zone7        0.37604    0.43484   0.865  0.39994    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for inverse.gaussian family taken to be 6.217246e-05)
## 
##     Null deviance: 0.0158876  on 24  degrees of freedom
## Residual deviance: 0.0009851  on 16  degrees of freedom
## AIC: 12165
## 
## Number of Fisher Scoring iterations: 5

library(tweedie)
# With exact zeros, Tweedie index xi must be between 1 and 2
# Fit the tweedie distribution
out <- tweedie.profile(severity ~ 1, xi.vec=seq(1.01, 1.99, length=9), do.plot=TRUE, data = moped)

## 1.01 1.1325 1.255 1.3775 1.5 1.6225 1.745 1.8675 1.99 
## .........Done.

out$xi.max

## [1] 1.15

# Plot this distribution
tweedie.plot(seq(0, max(moped$severity), length=1000), mu=mean(moped$severity), xi=out$xi.max, phi=out$phi.max)

# Fit the glm
library(statmod) # Provides tweedie family functions
summary(model.stweedie <- glm(severity ~ class + age + zone, family = tweedie(var.power=out$xi.max, link.power=0), data=moped))

## 
## Call:
## glm(formula = severity ~ class + age + zone, family = tweedie(var.power = out$xi.max, 
##     link.power = 0), data = moped)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -61.631  -13.654   -1.007    7.737   43.658  
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  9.1395475  0.3108972  29.397  < 2e-16 ***
## class1      -0.7454037  0.2257546  -3.302  0.00375 ** 
## age1         0.2899811  0.2165155   1.339  0.19627    
## zone2        0.1221210  0.3795417   0.322  0.75115    
## zone3        0.0043121  0.3889017   0.011  0.99127    
## zone4        0.1897670  0.3744810   0.507  0.61816    
## zone5       -0.0008962  0.3893320  -0.002  0.99819    
## zone6       -0.7393180  0.4666736  -1.584  0.12964    
## zone7       -0.3506001  0.4216913  -0.831  0.41607    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Tweedie family taken to be 625.2288)
## 
##     Null deviance: 27620  on 27  degrees of freedom
## Residual deviance: 15084  on 19  degrees of freedom
## AIC: NA
## 
## Number of Fisher Scoring iterations: 7

Premium: Combining the models

rel$rels.pure.premium <- with(rel, rels.frequency * rels.severity)
print(rel, digits = 2)

##    rating.factor class   dur n.claims rels.frequency rels.severity
## 1  Vehicle class     1  9833      391           1.00          1.00
## 2  Vehicle class     2  8825      395           0.78          0.55
## 11   Vehicle age     1  1918      141           1.55          1.79
## 21   Vehicle age     2 16740      645           1.00          1.00
## 12          Zone     1  1451      206           7.10          1.21
## 22          Zone     2  2486      209           4.17          1.07
## 3           Zone     3  2889      132           2.23          1.07
## 4           Zone     4 10069      207           1.00          1.00
## 5           Zone     5   246        6           1.20          1.21
## 6           Zone     6  1369       23           0.79          0.98
## 7           Zone     7   148        3           1.00          1.20
##    rels.pure.premium
## 1               1.00
## 2               0.42
## 11              2.78
## 21              1.00
## 12              8.62
## 22              4.48
## 3               2.38
## 4               1.00
## 5               1.46
## 6               0.78
## 7               1.20