Partial CASCO moped insurance from Wasa insurance, 1994-1999
moped <- read.table("http://www.karlin.mff.cuni.cz/~pesta/NMFM402/moped.txt", header=T) head(moped)
## class age zone duration severity number pure actual frequency ## 1 1 1 1 62.9 18256 17 4936 2049 0.27027027 ## 2 1 1 2 112.9 13632 7 845 1230 0.06200177 ## 3 1 1 3 133.1 20877 9 1411 762 0.06761833 ## 4 1 1 4 376.6 13045 7 242 396 0.01858736 ## 5 1 1 5 9.4 0 0 0 990 0.00000000 ## 6 1 1 6 70.8 15000 1 212 594 0.01412429
class
... 1
=Weight over 60kg and more than 2 gears; 2
=Otherage
... 1
=At most 1 year, 2
=2 years or morezone
... 1
=Central and semi-central parts of Sweden's three largest cities, 2
=suburbs and middle-sized towns, 3
=Lesser towns, except those in 5 or 7, 4
=Small towns and countryside, except 5-7, 5
=Northern towns, 6
=Northern countryside, 7
=Gotland (Sweden's largest island)")duration
... in yearsseverity
... Claim severity [in SEK]number
... Number of claimspure
... Pure premium [in SEK]actual
... Actual premium [in SEK; The premium for one year according to the tariff in force 1999.]frequency
... Claim frequency [in /year]moped <- within(moped, { class <- factor(class) age <- factor(age) zone <- factor(zone) }) summary(moped)
## class age zone duration severity number ## 1:14 1:14 1:4 Min. : 4.40 Min. : 0 Min. : 0.00 ## 2:14 2:14 2:4 1st Qu.: 69.67 1st Qu.: 4364 1st Qu.: 1.75 ## 3:4 Median : 177.00 Median : 6928 Median : 8.50 ## 4:4 Mean : 666.37 Mean : 7482 Mean : 28.07 ## 5:4 3rd Qu.: 818.20 3rd Qu.: 8218 3rd Qu.: 45.25 ## 6:4 Max. :5505.30 Max. :20877 Max. :136.00 ## 7:4 ## pure actual frequency ## Min. : 0.0 Min. : 119.0 Min. :0.00000 ## 1st Qu.: 114.5 1st Qu.: 238.0 1st Qu.:0.01581 ## Median : 211.5 Median : 396.0 Median :0.03334 ## Mean : 608.0 Mean : 550.0 Mean :0.06451 ## 3rd Qu.: 668.0 3rd Qu.: 645.8 3rd Qu.:0.07782 ## Max. :4936.0 Max. :2049.0 Max. :0.27027 ##
rel <- data.frame(rating.factor = c(rep("Vehicle class", nlevels(moped$class)), rep("Vehicle age", nlevels(moped$age)), rep("Zone", nlevels(moped$zone))), class = c(levels(moped$class), levels(moped$age), levels(moped$zone)), stringsAsFactors = FALSE)
foreach
package is convenient to use here, but you can of course do it with a normal loop and a couple of variables.
library("foreach")
## Calculate duration per rating factor level and also set the ## contrasts. We use foreach here to execute the loop both for its ## side-effect (setting the contrasts) and to accumulate the sums. new.cols <- foreach (rating.factor = c("class", "age", "zone"), .combine = rbind) %do% { nclaims <- tapply(moped$number, moped[[rating.factor]], sum) sums <- tapply(moped$duration, moped[[rating.factor]], sum) n.levels <- nlevels(moped[[rating.factor]]) contrasts(moped[[rating.factor]]) <- contr.treatment(n.levels)[rank(-sums, ties.method = "first"), ] data.frame(dur = sums, n.claims = nclaims) } rel <- cbind(rel, new.cols) rm(new.cols) print(rel)
## rating.factor class dur n.claims ## 1 Vehicle class 1 9833.2 391 ## 2 Vehicle class 2 8825.1 395 ## 11 Vehicle age 1 1918.4 141 ## 21 Vehicle age 2 16739.9 645 ## 12 Zone 1 1451.4 206 ## 22 Zone 2 2486.3 209 ## 3 Zone 3 2888.7 132 ## 4 Zone 4 10069.1 207 ## 5 Zone 5 246.1 6 ## 6 Zone 6 1369.2 23 ## 7 Zone 7 147.5 3
Please note: The purpose of this page is to show how to use various data analysis commands. It does not cover all aspects of the research process which actuaries are expected to do.
library(ggplot2)
ggplot(moped, aes(number, fill = class)) + geom_histogram(binwidth = 1) + facet_grid(class ~ ., margins = TRUE, scales = "free")
ggplot(moped, aes(number, fill = age)) + geom_histogram(binwidth = 1) + facet_grid(age ~ ., margins = TRUE, scales = "free")
ggplot(moped, aes(number, fill = zone)) + geom_histogram(binwidth = 1) + facet_grid(zone ~ ., margins = TRUE, scales = "free")
summary(model.frequency <- glm(number ~ class + age + zone + offset(log(duration)), data = moped, family = poisson))
## ## Call: ## glm(formula = number ~ class + age + zone + offset(log(duration)), ## family = poisson, data = moped) ## ## Deviance Residuals: ## Min 1Q Median 3Q Max ## -2.5001 -0.8712 -0.3153 0.8260 1.5251 ## ## Coefficients: ## Estimate Std. Error z value Pr(>|z|) ## (Intercept) -3.829639 0.074997 -51.064 < 2e-16 *** ## class1 -0.252640 0.073777 -3.424 0.000616 *** ## age1 0.437661 0.093954 4.658 3.19e-06 *** ## zone2 0.802747 0.111493 7.200 6.02e-13 *** ## zone3 1.428190 0.099375 14.372 < 2e-16 *** ## zone4 1.959875 0.101451 19.319 < 2e-16 *** ## zone5 -0.231218 0.219861 -1.052 0.292958 ## zone6 0.185408 0.414164 0.448 0.654393 ## zone7 0.000554 0.581627 0.001 0.999240 ## --- ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 ## ## (Dispersion parameter for poisson family taken to be 1) ## ## Null deviance: 520.352 on 27 degrees of freedom ## Residual deviance: 30.077 on 19 degrees of freedom ## AIC: 157.34 ## ## Number of Fisher Scoring iterations: 5
offset()
term as opposed to the weights=
argument. The offset is log(duration)
because our link function is log.
Recall that the offset is just a predictor variable whose coefficient is fixed at 1. So, using the standard setup for a Poisson regression with a log link, we have: \[ \log\mathsf{E}Y_i={\bf x}_i^{\top}{\boldsymbol\beta}+\log\omega_i \] where \(\omega_i\) is the offset/exposure variable. This can be rewritten as \[ \log\mathsf{E}[Y_i/\omega_i]={\bf x}_i^{\top}{\boldsymbol\beta} \] Your underlying random variable is still \(Y_i\), but by dividing by \(\omega_i\) we've converted the LHS of the model equation to be a rate of events per unit exposure. But this division also alters the variance of the response, so we have to weight by \(\omega_i\) when fitting the model.
# use quasipoisson family to stop glm complaining about nonintegral response summary(glm(number/duration ~ class + age + zone, family=quasipoisson, data=moped, weights=duration))
## ## Call: ## glm(formula = number/duration ~ class + age + zone, family = quasipoisson, ## data = moped, weights = duration) ## ## Deviance Residuals: ## Min 1Q Median 3Q Max ## -2.5001 -0.8712 -0.3153 0.8260 1.5251 ## ## Coefficients: ## Estimate Std. Error t value Pr(>|t|) ## (Intercept) -3.829639 0.094807 -40.394 < 2e-16 *** ## class1 -0.252640 0.093264 -2.709 0.01392 * ## age1 0.437661 0.118771 3.685 0.00157 ** ## zone2 0.802747 0.140943 5.696 1.72e-05 *** ## zone3 1.428190 0.125624 11.369 6.42e-10 *** ## zone4 1.959875 0.128248 15.282 3.96e-12 *** ## zone5 -0.231218 0.277934 -0.832 0.41579 ## zone6 0.185408 0.523561 0.354 0.72714 ## zone7 0.000554 0.735258 0.001 0.99941 ## --- ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 ## ## (Dispersion parameter for quasipoisson family taken to be 1.598048) ## ## Null deviance: 520.352 on 27 degrees of freedom ## Residual deviance: 30.077 on 19 degrees of freedom ## AIC: NA ## ## Number of Fisher Scoring iterations: 6
The information on deviance is also provided. We can use the residual deviance to perform a goodness-of-fit test for the overall model. The residual deviance is the difference between the deviance of the current model and the maximum deviance of the ideal model where the predicted values are identical to the observed. Therefore, if the residual difference is small enough, the goodness of fit test will not be significant, indicating that the model fits the data. We conclude that the model fits reasonably well because the goodness-of-fit chi-squared test is not statistically significant. If the test had been statistically significant, it would indicate that the data do not fit the model well. In that situation, we may try to determine if there are omitted predictor variables, if our linearity assumption holds and/or if there is an issue of over-dispersion.
with(model.frequency, cbind(res.deviance = deviance, df = df.residual, p = pchisq(deviance, df.residual, lower.tail = FALSE)))
## res.deviance df p ## [1,] 30.07667 19 0.05083071
We can also test the overall effect of zone
by comparing the deviance of the full model with the deviance of the model excluding zone
. The six degree-of-freedom chi-square test indicates that zone
, taken together, is a statistically significant predictor of number
.
## update model.frequency model dropping zone model.f2 <- update(model.frequency, . ~ . - zone) ## test model differences with chi square test anova(model.f2, model.frequency, test = "Chisq")
## Analysis of Deviance Table ## ## Model 1: number ~ class + age + offset(log(duration)) ## Model 2: number ~ class + age + zone + offset(log(duration)) ## Resid. Df Resid. Dev Df Deviance Pr(>Chi) ## 1 25 477.34 ## 2 19 30.08 6 447.27 < 2.2e-16 *** ## --- ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Diagnostics
Anscombe residuals
library(wle)
res_ans <- residualsAnscombe(moped$number, mu=model.frequency$fitted.values, family = poisson()) qqnorm(res_ans) qqline(res_ans)
Graphical prediction
newdata1 <- expand.grid(1:7, factor(1:2), 1:2) colnames(newdata1) <- c("zone", "class", "age") #newdata1 <- subset(newdata1, subset = (zone <= age)) newdata1 <- cbind(newdata1,rep(median(moped$duration), dim(newdata1)[1])) colnames(newdata1)[4] <- "duration" newdata1 <- within(newdata1, { age <- factor(age) zone <- factor(zone) }) newdata1$phat <- predict(model.frequency, newdata1)
ggplot(newdata1, aes(x = zone, y = phat, colour = factor(age))) + geom_point(size = 6) + facet_wrap(~class) + labs(x = "Zone", y = "Predicted number of claims")
Prediction of (relative) frequency for each rating factor.
rels <- coef( model.frequency ) rels <- exp( rels[1] + rels[-1] ) / exp( rels[1] ) rel$rels.frequency <- c(c(1, rels[1])[rank(-rel$dur[1:2], ties.method = "first")], c(1, rels[2])[rank(-rel$dur[3:4], ties.method = "first")], c(1, rels[3:8])[rank(-rel$dur[5:11], ties.method = "first")])
library(MASS)
summary(model.NB <- glm.nb(number ~ class + age + zone + offset(log(duration)), data = moped, control = glm.control(maxit = 25, trace = T)))
## Warning in glm.nb(number ~ class + age + zone + offset(log(duration)), ## data = moped, : alternation limit reached
## ## Call: ## glm.nb(formula = number ~ class + age + zone + offset(log(duration)), ## data = moped, control = glm.control(maxit = 25, trace = T), ## init.theta = 263.1764465, link = log) ## ## Deviance Residuals: ## Min 1Q Median 3Q Max ## -2.3858 -0.7748 -0.2810 0.7147 1.4625 ## ## Coefficients: ## Estimate Std. Error z value Pr(>|z|) ## (Intercept) -3.85536 0.08785 -43.886 < 2e-16 *** ## class1 -0.23859 0.08190 -2.913 0.00358 ** ## age1 0.43126 0.09957 4.331 1.48e-05 *** ## zone2 0.82014 0.12467 6.578 4.76e-11 *** ## zone3 1.44358 0.11310 12.764 < 2e-16 *** ## zone4 1.98693 0.11444 17.362 < 2e-16 *** ## zone5 -0.20870 0.22728 -0.918 0.35849 ## zone6 0.20529 0.41803 0.491 0.62336 ## zone7 0.02115 0.58396 0.036 0.97110 ## --- ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 ## ## (Dispersion parameter for Negative Binomial(263.1762) family taken to be 1) ## ## Null deviance: 429.738 on 27 degrees of freedom ## Residual deviance: 27.258 on 19 degrees of freedom ## AIC: 159.17 ## ## Number of Fisher Scoring iterations: 1 ## ## ## Theta: 263 ## Std. Err.: 561 ## Warning while fitting theta: alternation limit reached ## ## 2 x log-likelihood: -139.166
Negative binomial models assume the conditional means are not equal to the conditional variances. This inequality is captured by estimating a dispersion parameter (not shown in the output) that is held constant in a Poisson model. Thus, the Poisson model is actually nested in the negative binomial model. We can then use a likelihood ratio test to compare these two and test this model assumption. To do this, we will run our model as a Poisson.
(diff2loglik <- as.numeric(2 * (logLik(model.NB) - logLik(model.frequency))))
## [1] 0.17505
pchisq(diff2loglik, df = 1, lower.tail = FALSE)
## [1] 0.6756621
ggplot(moped, aes(severity)) + geom_histogram() + scale_x_log10() + facet_grid(class ~ age, margins = TRUE, scales = "free_y")
## Warning: position_stack requires constant width: output may be incorrect
## Warning: position_stack requires constant width: output may be incorrect
## Warning: position_stack requires constant width: output may be incorrect
## Warning: position_stack requires constant width: output may be incorrect
## Warning: position_stack requires constant width: output may be incorrect
## Warning: position_stack requires constant width: output may be incorrect
## Warning: position_stack requires constant width: output may be incorrect
## Warning: position_stack requires constant width: output may be incorrect
## Warning: position_stack requires constant width: output may be incorrect
ggplot(moped, aes(class, severity)) + geom_violin() + geom_jitter(size = 1.5) + scale_y_log10() + stat_smooth(aes(x = class, y = severity, group = 1), method = "loess")
## Warning: Removed 3 rows containing non-finite values (stat_ydensity).
ggplot(moped, aes(age, severity)) + geom_violin() + geom_jitter(size = 1.5) + scale_y_log10() + stat_smooth(aes(x = age, y = severity, group = 1), method = "loess")
## Warning: Removed 3 rows containing non-finite values (stat_ydensity).
ggplot(moped, aes(zone, severity)) + geom_violin() + geom_jitter(size = 1.5) + scale_y_log10() + stat_smooth(aes(x = zone, y = severity, group = 1), method = "loess")
## Warning: Removed 3 rows containing non-finite values (stat_ydensity).
There are a couple of points to note here:
moped[moped$severity > 0, ]
construct.log
link function even though the canonical function (inverse
) gives a slightly better fit (residual deviance 5.9 versus 8.0 on 16 degrees of freedom). The reason is the interpretability.summary(model.severity <- glm(severity ~ class + age + zone, data = moped[moped$severity > 0, ], family = Gamma("log"), weights = number))
## ## Call: ## glm(formula = severity ~ class + age + zone, family = Gamma("log"), ## data = moped[moped$severity > 0, ], weights = number) ## ## Deviance Residuals: ## Min 1Q Median 3Q Max ## -1.55662 -0.25644 0.01745 0.32310 1.42683 ## ## Coefficients: ## Estimate Std. Error t value Pr(>|t|) ## (Intercept) 8.85756 0.05301 167.089 < 2e-16 *** ## class1 -0.60677 0.05494 -11.044 6.78e-09 *** ## age1 0.58397 0.06943 8.411 2.88e-07 *** ## zone2 0.06416 0.08066 0.795 0.4380 ## zone3 0.07206 0.07328 0.983 0.3401 ## zone4 0.19400 0.07472 2.596 0.0195 * ## zone5 -0.02100 0.15890 -0.132 0.8965 ## zone6 0.19151 0.29983 0.639 0.5320 ## zone7 0.18126 0.42039 0.431 0.6721 ## --- ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 ## ## (Dispersion parameter for Gamma family taken to be 0.521651) ## ## Null deviance: 109.7707 on 24 degrees of freedom ## Residual deviance: 7.9998 on 16 degrees of freedom ## AIC: 12377 ## ## Number of Fisher Scoring iterations: 5
(sev.resid <- as.vector(residuals(model.severity)))
## [1] 0.75073118 0.01745143 1.42683169 0.09215516 0.20184252 ## [6] -0.57298737 -0.25643766 -0.14864016 -0.20377173 -0.17560413 ## [11] 0.39737962 -0.51922734 -0.24852120 -0.47166985 -0.36223425 ## [16] -1.55662151 0.52134777 -0.24190563 0.16008063 0.32310306 ## [21] 0.18797378 0.02994634 -0.32529267 0.81065248 0.58028815
Some diagnostics
diagnostics <- as.data.frame(cbind(sev.resid,moped$severity[moped$severity > 0])) colnames(diagnostics) <- c("residuals","observed") qplot(observed, residuals, size = I(4), data = diagnostics)
qplot(observed, residuals^2, size = I(4), data = diagnostics)
Pearson residuals
res_pearson <- residuals(model.severity, "pearson") qplot(diagnostics$observed, res_pearson, size = I(4), xlab="Y", ylab="Pearson residuals")
Deviance residuals
res_deviance <- residuals(model.severity, "deviance") qplot(diagnostics$observed, res_deviance, size = I(4), xlab="Y", ylab="Deviance residuals")
Working residuals
res_working <- residuals(model.severity, "working") qplot(diagnostics$observed, res_working, size = I(4), xlab="Y", ylab="Working residuals")
Anscombe residuals
res_anscombe <- residualsAnscombe(moped$severity[moped$severity > 0], mu=model.severity$fitted, Gamma("log")) qplot(diagnostics$observed, res_anscombe, size = I(4), xlab="Y", ylab="Anscombe residuals")
qqnorm(res_anscombe) qqline(res_anscombe)
Prediction
newdata1 <- expand.grid(1:7, factor(1:2), 1:2) colnames(newdata1) <- c("zone", "class", "age") #newdata1 <- subset(newdata1, subset = (zone <= age)) newdata1 <- cbind(newdata1,rep(median(moped$duration), dim(newdata1)[1])) colnames(newdata1)[4] <- "duration" newdata1 <- within(newdata1, { age <- factor(age) zone <- factor(zone) }) newdata1$phat <- predict(model.severity, newdata1)
ggplot(newdata1, aes(x = zone, y = phat, colour = factor(age))) + geom_point(size = 6) + facet_wrap(~class) + labs(x = "Zone", y = "Predicted claim amount")
rels <- coef( model.severity ) rels <- exp( rels[1] + rels[-1] ) / exp( rels[1] ) ## Aside: For the canonical link function use ## rels <- rels[1] / (rels[1] + rels[-1]) rel$rels.severity <- c(c(1, rels[1])[rank(-rel$dur[1:2], ties.method = "first")], c(1, rels[2])[rank(-rel$dur[3:4], ties.method = "first")], c(1, rels[3:8])[rank(-rel$dur[5:11], ties.method = "first")])
Other posibilities GoF test.
with(model.severity, cbind(res.deviance = deviance, df = df.residual, p = pchisq(deviance, df.residual, lower.tail = FALSE)))
## res.deviance df p ## [1,] 7.99982 16 0.9488717
Submodel testing.
## update model.severity model dropping zone model.s2 <- update(model.severity, . ~ . - zone) ## test model differences with chi square test anova(model.s2, model.severity, test = "Chisq")
## Analysis of Deviance Table ## ## Model 1: severity ~ class + age ## Model 2: severity ~ class + age + zone ## Resid. Df Resid. Dev Df Deviance Pr(>Chi) ## 1 22 12.0631 ## 2 16 7.9998 6 4.0633 0.2539
## simplifued severity model summary(model.s2)
## ## Call: ## glm(formula = severity ~ class + age, family = Gamma("log"), ## data = moped[moped$severity > 0, ], weights = number) ## ## Deviance Residuals: ## Min 1Q Median 3Q Max ## -1.7323 -0.4808 -0.0401 0.5229 1.3237 ## ## Coefficients: ## Estimate Std. Error t value Pr(>|t|) ## (Intercept) 8.92016 0.03851 231.651 < 2e-16 *** ## class1 -0.57354 0.05421 -10.581 4.28e-10 *** ## age1 0.61521 0.07064 8.709 1.40e-08 *** ## --- ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 ## ## (Dispersion parameter for Gamma family taken to be 0.5572562) ## ## Null deviance: 109.771 on 24 degrees of freedom ## Residual deviance: 12.063 on 22 degrees of freedom ## AIC: 12688 ## ## Number of Fisher Scoring iterations: 5
Gamma with inverse link (canonical)
summary(model.sGi <- glm(severity ~ class + age + zone, data = moped[moped$severity > 0, ], family = Gamma, weights = number))
## ## Call: ## glm(formula = severity ~ class + age + zone, family = Gamma, ## data = moped[moped$severity > 0, ], weights = number) ## ## Deviance Residuals: ## Min 1Q Median 3Q Max ## -1.28467 -0.32335 0.03316 0.30575 0.83049 ## ## Coefficients: ## Estimate Std. Error t value Pr(>|t|) ## (Intercept) 1.465e-04 6.641e-06 22.052 2.11e-13 *** ## class1 9.039e-05 7.091e-06 12.747 8.54e-10 *** ## age1 -7.577e-05 6.483e-06 -11.688 3.01e-09 *** ## zone2 -1.092e-05 9.549e-06 -1.143 0.270 ## zone3 -7.902e-06 9.432e-06 -0.838 0.414 ## zone4 -2.217e-05 8.737e-06 -2.537 0.022 * ## zone5 6.993e-06 2.091e-05 0.334 0.742 ## zone6 -3.576e-05 3.845e-05 -0.930 0.366 ## zone7 1.255e-05 4.395e-05 0.286 0.779 ## --- ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 ## ## (Dispersion parameter for Gamma family taken to be 0.3756071) ## ## Null deviance: 109.771 on 24 degrees of freedom ## Residual deviance: 5.877 on 16 degrees of freedom ## AIC: 12134 ## ## Number of Fisher Scoring iterations: 5
Gamma with inverse squared link
summary(model.sG2 <- glm(severity ~ class + age + zone, data = moped[moped$severity > 0, ], family = Gamma("1/mu^2"), weights = number))
## ## Call: ## glm(formula = severity ~ class + age + zone, family = Gamma("1/mu^2"), ## data = moped[moped$severity > 0, ], weights = number) ## ## Deviance Residuals: ## Min 1Q Median 3Q Max ## -1.41879 -0.35551 0.04919 0.31231 1.57948 ## ## Coefficients: ## Estimate Std. Error t value Pr(>|t|) ## (Intercept) 2.170e-08 2.567e-09 8.453 2.69e-07 *** ## class1 2.400e-08 3.459e-09 6.937 3.34e-06 *** ## age1 -1.600e-08 2.160e-09 -7.407 1.48e-06 *** ## zone2 -3.086e-09 2.907e-09 -1.062 0.304 ## zone3 -1.362e-09 3.301e-09 -0.413 0.685 ## zone4 -3.275e-09 2.801e-09 -1.169 0.259 ## zone5 1.428e-09 8.081e-09 0.177 0.862 ## zone6 -1.071e-08 1.266e-08 -0.846 0.410 ## zone7 4.780e-09 1.691e-08 0.283 0.781 ## --- ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 ## ## (Dispersion parameter for Gamma family taken to be 0.7625844) ## ## Null deviance: 109.771 on 24 degrees of freedom ## Residual deviance: 11.806 on 16 degrees of freedom ## AIC: 12683 ## ## Number of Fisher Scoring iterations: 8
Gamma with identity link
summary(model.sGidentity <- glm(severity ~ class + age + zone, data = moped[moped$severity > 0, ], family = Gamma("identity"), weights = number))
## ## Call: ## glm(formula = severity ~ class + age + zone, family = Gamma("identity"), ## data = moped[moped$severity > 0, ], weights = number) ## ## Deviance Residuals: ## Min 1Q Median 3Q Max ## -1.75903 -0.44915 -0.02916 0.31961 2.03153 ## ## Coefficients: ## Estimate Std. Error t value Pr(>|t|) ## (Intercept) 7202.2 495.1 14.546 1.21e-10 *** ## class1 -3483.6 489.4 -7.119 2.44e-06 *** ## age1 3790.1 807.6 4.693 0.000244 *** ## zone2 366.7 664.0 0.552 0.588394 ## zone3 419.1 557.8 0.751 0.463315 ## zone4 1063.2 605.5 1.756 0.098255 . ## zone5 217.4 1358.5 0.160 0.874887 ## zone6 710.5 2171.6 0.327 0.747773 ## zone7 2036.1 4751.9 0.428 0.674011 ## --- ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 ## ## (Dispersion parameter for Gamma family taken to be 1.071622) ## ## Null deviance: 109.771 on 24 degrees of freedom ## Residual deviance: 14.789 on 16 degrees of freedom ## AIC: 12861 ## ## Number of Fisher Scoring iterations: 7
Normal with identity link (canonical)
summary(model.sNcan <- glm(severity ~ class + age + zone, data = moped[moped$severity > 0, ], family = gaussian(link = "identity"), weights = number))
## ## Call: ## glm(formula = severity ~ class + age + zone, family = gaussian(link = "identity"), ## data = moped[moped$severity > 0, ], weights = number) ## ## Deviance Residuals: ## Min 1Q Median 3Q Max ## -13626.0 -3992.1 951.4 3346.5 24341.4 ## ## Coefficients: ## Estimate Std. Error t value Pr(>|t|) ## (Intercept) 7312.60 740.85 9.871 3.29e-08 *** ## class1 -4252.82 767.80 -5.539 4.49e-05 *** ## age1 4817.84 970.35 4.965 0.00014 *** ## zone2 632.76 1127.29 0.561 0.58236 ## zone3 496.16 1024.12 0.484 0.63462 ## zone4 1473.01 1044.22 1.411 0.17750 ## zone5 -476.89 2220.75 -0.215 0.83268 ## zone6 1849.61 4190.30 0.441 0.66483 ## zone7 -63.27 5875.10 -0.011 0.99154 ## --- ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 ## ## (Dispersion parameter for gaussian family taken to be 101884490) ## ## Null deviance: 6820988245 on 24 degrees of freedom ## Residual deviance: 1630151847 on 16 degrees of freedom ## AIC: 478.47 ## ## Number of Fisher Scoring iterations: 2
Normal with log link
summary(model.sNlog <- glm(severity ~ class + age + zone, data = moped[moped$severity > 0, ], family = gaussian(link = "log"), weights = number))
## ## Call: ## glm(formula = severity ~ class + age + zone, family = gaussian(link = "log"), ## data = moped[moped$severity > 0, ], weights = number) ## ## Deviance Residuals: ## Min 1Q Median 3Q Max ## -11677.3 -2257.6 220.6 3476.8 13652.1 ## ## Coefficients: ## Estimate Std. Error t value Pr(>|t|) ## (Intercept) 8.84907 0.06613 133.818 < 2e-16 *** ## class1 -0.72303 0.07300 -9.904 3.14e-08 *** ## age1 0.71334 0.06569 10.859 8.63e-09 *** ## zone2 0.13812 0.09353 1.477 0.1592 ## zone3 0.05901 0.09522 0.620 0.5442 ## zone4 0.20928 0.08801 2.378 0.0302 * ## zone5 -0.07315 0.21110 -0.347 0.7335 ## zone6 0.37682 0.35890 1.050 0.3094 ## zone7 -0.34514 0.54065 -0.638 0.5323 ## --- ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 ## ## (Dispersion parameter for gaussian family taken to be 37687021) ## ## Null deviance: 6.821e+09 on 24 degrees of freedom ## Residual deviance: 6.030e+08 on 16 degrees of freedom ## AIC: 453.61 ## ## Number of Fisher Scoring iterations: 5
Normal with inverse link
summary(model.sNi <- glm(severity ~ class + age + zone, data = moped[moped$severity > 0, ], family = gaussian(link = "inverse"), weights = number))
## ## Call: ## glm(formula = severity ~ class + age + zone, family = gaussian(link = "inverse"), ## data = moped[moped$severity > 0, ], weights = number) ## ## Deviance Residuals: ## Min 1Q Median 3Q Max ## -9454.8 -2351.3 798.9 2569.8 5419.4 ## ## Coefficients: ## Estimate Std. Error t value Pr(>|t|) ## (Intercept) 1.476e-04 6.131e-06 24.068 5.41e-14 *** ## class1 8.657e-05 7.699e-06 11.245 5.24e-09 *** ## age1 -7.535e-05 4.945e-06 -15.237 6.04e-11 *** ## zone2 -2.090e-05 6.774e-06 -3.086 0.00709 ** ## zone3 -4.753e-06 8.009e-06 -0.593 0.56118 ## zone4 -1.887e-05 6.635e-06 -2.844 0.01171 * ## zone5 3.994e-06 1.811e-05 0.221 0.82817 ## zone6 -5.031e-05 2.732e-05 -1.841 0.08417 . ## zone7 3.789e-05 4.875e-05 0.777 0.44839 ## --- ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 ## ## (Dispersion parameter for gaussian family taken to be 18337492) ## ## Null deviance: 6.821e+09 on 24 degrees of freedom ## Residual deviance: 2.934e+08 on 16 degrees of freedom ## AIC: 435.6 ## ## Number of Fisher Scoring iterations: 7
Normal with inverse squared link
summary(model.sNis <- glm(severity ~ class + age + zone, data = moped[moped$severity > 0, ], family = gaussian(link = "1/mu^2"), weights = number))
## ## Call: ## glm(formula = severity ~ class + age + zone, family = gaussian(link = "1/mu^2"), ## data = moped[moped$severity > 0, ], weights = number) ## ## Deviance Residuals: ## Min 1Q Median 3Q Max ## -7176.4 -2421.2 -33.8 1763.1 9398.1 ## ## Coefficients: ## Estimate Std. Error t value Pr(>|t|) ## (Intercept) 2.119e-08 1.803e-09 11.753 2.78e-09 *** ## class1 2.212e-08 3.286e-09 6.730 4.83e-06 *** ## age1 -1.533e-08 1.550e-09 -9.891 3.20e-08 *** ## zone2 -3.526e-09 1.539e-09 -2.290 0.0359 * ## zone3 -8.443e-10 1.912e-09 -0.442 0.6647 ## zone4 -2.954e-09 1.543e-09 -1.914 0.0736 . ## zone5 -1.576e-10 4.062e-09 -0.039 0.9695 ## zone6 -1.228e-08 6.148e-09 -1.997 0.0631 . ## zone7 7.474e-09 1.480e-08 0.505 0.6205 ## --- ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 ## ## (Dispersion parameter for gaussian family taken to be 25575956) ## ## Null deviance: 6820988245 on 24 degrees of freedom ## Residual deviance: 409209544 on 16 degrees of freedom ## AIC: 443.91 ## ## Number of Fisher Scoring iterations: 8
Inverse gaussian with inverse squared link (canonical)
summary(model.sIis <- glm(severity ~ class + age + zone, data = moped[moped$severity > 0, ], family = inverse.gaussian("1/mu^2"), weights = number))
## Warning in sqrt(eta): NaNs produced
## Error: no valid set of coefficients has been found: please supply starting values
Inverse gaussian with log link
summary(model.sIlog <- glm(severity ~ class + age + zone, data = moped[moped$severity > 0, ], family = inverse.gaussian("log"), weights = number))
## ## Call: ## glm(formula = severity ~ class + age + zone, family = inverse.gaussian("log"), ## data = moped[moped$severity > 0, ], weights = number) ## ## Deviance Residuals: ## Min 1Q Median 3Q Max ## -0.0189284 -0.0032597 -0.0008518 0.0022828 0.0133261 ## ## Coefficients: ## Estimate Std. Error t value Pr(>|t|) ## (Intercept) 8.85075 0.04654 190.177 < 2e-16 *** ## class1 -0.58491 0.04720 -12.392 1.29e-09 *** ## age1 0.54405 0.06726 8.089 4.80e-07 *** ## zone2 0.05579 0.06830 0.817 0.42599 ## zone3 0.07635 0.06006 1.271 0.22181 ## zone4 0.19083 0.06302 3.028 0.00799 ** ## zone5 0.03402 0.13641 0.249 0.80622 ## zone6 0.12829 0.23617 0.543 0.59446 ## zone7 0.37604 0.43484 0.865 0.39994 ## --- ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 ## ## (Dispersion parameter for inverse.gaussian family taken to be 6.217246e-05) ## ## Null deviance: 0.0158876 on 24 degrees of freedom ## Residual deviance: 0.0009851 on 16 degrees of freedom ## AIC: 12165 ## ## Number of Fisher Scoring iterations: 5
Tweedie as compound Poisson-gamma distribution
library(tweedie) # With exact zeros, Tweedie index xi must be between 1 and 2 # Fit the tweedie distribution out <- tweedie.profile(severity ~ 1, xi.vec=seq(1.01, 1.99, length=9), do.plot=TRUE, data = moped)
## 1.01 1.1325 1.255 1.3775 1.5 1.6225 1.745 1.8675 1.99 ## .........Done.
out$xi.max
## [1] 1.15
# Plot this distribution tweedie.plot(seq(0, max(moped$severity), length=1000), mu=mean(moped$severity), xi=out$xi.max, phi=out$phi.max)
# Fit the glm library(statmod) # Provides tweedie family functions summary(model.stweedie <- glm(severity ~ class + age + zone, family = tweedie(var.power=out$xi.max, link.power=0), data=moped))
## ## Call: ## glm(formula = severity ~ class + age + zone, family = tweedie(var.power = out$xi.max, ## link.power = 0), data = moped) ## ## Deviance Residuals: ## Min 1Q Median 3Q Max ## -61.631 -13.654 -1.007 7.737 43.658 ## ## Coefficients: ## Estimate Std. Error t value Pr(>|t|) ## (Intercept) 9.1395475 0.3108972 29.397 < 2e-16 *** ## class1 -0.7454037 0.2257546 -3.302 0.00375 ** ## age1 0.2899811 0.2165155 1.339 0.19627 ## zone2 0.1221210 0.3795417 0.322 0.75115 ## zone3 0.0043121 0.3889017 0.011 0.99127 ## zone4 0.1897670 0.3744810 0.507 0.61816 ## zone5 -0.0008962 0.3893320 -0.002 0.99819 ## zone6 -0.7393180 0.4666736 -1.584 0.12964 ## zone7 -0.3506001 0.4216913 -0.831 0.41607 ## --- ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 ## ## (Dispersion parameter for Tweedie family taken to be 625.2288) ## ## Null deviance: 27620 on 27 degrees of freedom ## Residual deviance: 15084 on 19 degrees of freedom ## AIC: NA ## ## Number of Fisher Scoring iterations: 7
Now it is trivial to combine and display the results.
rel$rels.pure.premium <- with(rel, rels.frequency * rels.severity) print(rel, digits = 2)
## rating.factor class dur n.claims rels.frequency rels.severity ## 1 Vehicle class 1 9833 391 1.00 1.00 ## 2 Vehicle class 2 8825 395 0.78 0.55 ## 11 Vehicle age 1 1918 141 1.55 1.79 ## 21 Vehicle age 2 16740 645 1.00 1.00 ## 12 Zone 1 1451 206 7.10 1.21 ## 22 Zone 2 2486 209 4.17 1.07 ## 3 Zone 3 2889 132 2.23 1.07 ## 4 Zone 4 10069 207 1.00 1.00 ## 5 Zone 5 246 6 1.20 1.21 ## 6 Zone 6 1369 23 0.79 0.98 ## 7 Zone 7 148 3 1.00 1.20 ## rels.pure.premium ## 1 1.00 ## 2 0.42 ## 11 2.78 ## 21 1.00 ## 12 8.62 ## 22 4.48 ## 3 2.38 ## 4 1.00 ## 5 1.46 ## 6 0.78 ## 7 1.20