The template for dataexpks is available on GitHub at

https://github.com/DublinLearningGroup/dataexpks

This workbook uses the freMTPL2 data from the CASdatasets package, the companion package to the book “Computational Actuarial Science with R” edited by Arthur Charpentier.

rm(list = ls())

knitr::opts_chunk$set(tidy       = FALSE
                     ,cache      = FALSE
                     ,fig.height =     8
                     ,fig.width  =    11
                     )

library(tidyverse)
library(forcats)
library(lubridate)
library(scales)
library(GGally)
library(cowplot)
library(Rtsne)

library(CASdatasets)

options(width = 90L
       ,warn  = 1)

set.seed(42)

#source("custom_functions.R")


### clean_names() sanitises the column names of the data
clean_names <- function(colnames) {
    colnames <- gsub(" ",          "_", colnames) %>%
        gsub("/",                  "_",      .) %>%
        gsub("\\.",                "_",      .) %>%
        gsub("\\-",                "_",      .) %>%
        gsub("'",                  "",       .) %>%
        gsub("`",                  "",       .) %>%
        gsub("\\(",                "",       .) %>%
        gsub("\\)",                "",       .) %>%
        gsub("\\?",                "",       .) %>%
        gsub("\\%",                "",       .) %>%
        gsub("\u2019",             "",       .) %>%
        gsub("\u20AC",            'EUR',     .) %>%
        gsub('(.)([A-Z][a-z]+)',  '\\1_\\2', .) %>%
        gsub('([a-z0-9])([A-Z])', '\\1_\\2', .) %>%
        gsub('1_st',              '1st',     .) %>%
        gsub('2_nd',              '2nd',     .) %>%
        tolower %>%
        gsub('__+',               '_',       .)

    return(colnames)
}
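
### Illustrative example (not in the original template): clean_names()
### converts raw column names into snake_case.
clean_names(c("IDpol", "ClaimNb", "Veh Power", "Bonus-Malus"))
## expected: "i_dpol" "claim_nb" "veh_power" "bonus_malus"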


### Checks if variable is a date/time
is_date <- function(.x) inherits(.x, c("POSIXt", "POSIXct", "POSIXlt", "Date", "hms"))


### Returns the category of data type passed to it
categorise_datatype <- function (x) {
    if(all(is.na(x))) return("na")

    if(is_date(x))                          "datetime"
    else if (!is.null(attributes(x)) ||
             all(is.character(x)))          "discrete"
    else if (all(is.logical(x)))            "logical"
    else                                    "continuous"
}
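
### Illustrative examples (not in the original template) of the categories
### returned by categorise_datatype():
categorise_datatype(Sys.Date())           ## expected: "datetime"
categorise_datatype(c("a", "b"))          ## expected: "discrete"
categorise_datatype(factor(c("a", "b")))  ## expected: "discrete" (factors carry attributes)
categorise_datatype(c(TRUE, FALSE))       ## expected: "logical"
categorise_datatype(c(1.5, 2.3))          ## expected: "continuous"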


### create_coltype_list() splits columns into various types
create_coltype_list <- function(data_tbl) {
    coltypes <- data_tbl %>%
        map_chr(categorise_datatype)

    cat_types <- coltypes %>%
        unique %>%
        sort

    split_lst <- cat_types %>%
        map(function(x) { coltypes[coltypes %in% x] %>% names })

    names(split_lst) <- coltypes %>% unique %>% sort

    coltype_lst <- list(
        split   = split_lst
       ,columns = coltypes
    )

    return(coltype_lst)
}


### Creates a subset of rows and columns of a data frame
create_ggpairs_tbl <- function(data_tbl, sample_cols, sample_rows, verbose = FALSE) {

    ncol_data <- ncol(data_tbl)
    nrow_data <- nrow(data_tbl)

    if(ncol_data > sample_cols) {
        col_index <- sample(ncol(data_tbl), sample_cols, replace = FALSE) %>% sort
    } else {
        col_index <- 1:ncol_data
    }

    row_count <- ifelse(nrow_data > sample_rows, sample_rows, nrow_data)

    if(verbose) {
        cat(paste0(names(data_tbl)[col_index], collapse = ','))
        cat("\n")

        data_tbl %>% select(col_index) %>% glimpse
    }

    sample_tbl <- data_tbl %>%
        select(col_index) %>%
        sample_n(row_count)

    ### Flag samples that contain no complete cases so we can redraw
    missing_check <- (sample_tbl %>% filter(complete.cases(.)) %>% nrow) == 0

    ### Flag samples where any column contains only a single value
    unique_val_count <- sample_tbl %>%
        summarise_all(function(x) x %>% unique %>% length) %>%
        gather %>%
        filter(value == 1) %>%
        nrow
    same_check <- (unique_val_count > 0)

    if(missing_check || same_check) {
        sample_tbl <- create_ggpairs_tbl(data_tbl, sample_cols, sample_rows, verbose = verbose)
    }

    return(sample_tbl)
}
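
### Hypothetical usage sketch (commented out - data_tbl is only created later):
### draw a manageable sample of rows and columns and pass it to GGally::ggpairs()
#pairs_tbl <- create_ggpairs_tbl(data_tbl, sample_cols = 8, sample_rows = 1000)
#GGally::ggpairs(pairs_tbl)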


### Creates outputs for samples of data if row count is above a threshold. Otherwise calculates once
### on the whole dataset
create_sampled_output <- function(fulldata_tbl, summ_func, row_count = 2000, iter_count = 5) {
    output_lst <- list()

    if((fulldata_tbl %>% nrow) > row_count) {
        for(i in 1:iter_count) {
            sample_tbl <- fulldata_tbl %>%
                sample_n(row_count)

            output_lst[[i]] <- summ_func(sample_tbl)
        }
    } else {
        output_lst[[1]] <- summ_func(fulldata_tbl)
    }

    return(output_lst)
}
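
### Hypothetical usage sketch (commented out): summarise repeated samples of a
### column once data_tbl exists
#exposure_summ_lst <- create_sampled_output(data_tbl
#                                          ,summ_func  = function(x) summary(x$exposure)
#                                          ,row_count  = 2000
#                                          ,iter_count = 5
#                                           )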


### Identify outliers
identify_univariate_outliers <- function(x) {
    outlier_vals <- boxplot.stats(x)$out

    outlier_point <- x %in% outlier_vals

    return(outlier_point)
}
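
### Illustrative example (not in the original template): only the extreme
### value is flagged as an outlier
identify_univariate_outliers(c(1:10, 100))
## expected: TRUE for the final element (100) only, FALSE otherwise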


### Jitter numeric variable
add_jitter <- function(x) x * rlnorm(length(x), 0, 0.0001)


### Jitter numeric data to assist with uniqueness
jitter_numeric_variables <- function(data_tbl) {
    data_tbl <- data_tbl %>%
        mutate_if(is.numeric, add_jitter)

    return(data_tbl)
}

1 Introduction

This workbook performs a basic data exploration of the dataset.

dataexp_level_exclusion_threshold <- 100

dataexp_cat_level_count <- 40
dataexp_hist_bins_count <- 50

2 Load Data

First we load the dataset.

### _TEMPLATE_
### Data is loaded into dataset rawdata_tbl here

### We may wish to set column types
#data_col_types <- cols(
#    VAR1 = col_character()
#   ,VAR2 = col_date()
#   ,VAR3 = col_number()
#)

### Data is loaded into dataset rawdata_tbl here
#rawdata_tbl <- read_csv(DATAFILE
#                       ,locale    = locale()
#                       ,col_types = data_col_types
#                       ,progress  = FALSE
#                       )
data(freMTPL2freq)
data(freMTPL2sev)

claim_total_tbl <- freMTPL2sev %>%
    group_by(IDpol) %>%
    summarise(claim_count = n()
             ,claim_total = sum(ClaimAmount)
              )

rawdata_tbl <- freMTPL2freq %>%
    left_join(claim_total_tbl, by = 'IDpol') %>%
    mutate(claim_count = ifelse(is.na(claim_count), 0, claim_count)
          ,claim_total = ifelse(is.na(claim_total), 0, claim_total))


glimpse(rawdata_tbl)
## Observations: 678,013
## Variables: 14
## $ IDpol       <dbl> 1, 3, 5, 10, 11, 13, 15, 17, 18, 21, 25, 27, 30, 32, 35, 36, 38, ...
## $ ClaimNb     <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2,...
## $ Exposure    <dbl> 0.10, 0.77, 0.75, 0.09, 0.84, 0.52, 0.45, 0.27, 0.71, 0.15, 0.75,...
## $ Area        <fctr> D, D, B, B, B, E, E, C, C, B, B, C, D, D, E, F, A, A, A, A, A, E...
## $ VehPower    <int> 5, 5, 6, 7, 7, 6, 6, 7, 7, 7, 7, 7, 4, 4, 4, 9, 6, 6, 6, 6, 6, 7,...
## $ VehAge      <int> 0, 0, 2, 0, 0, 2, 2, 0, 0, 0, 0, 0, 1, 0, 9, 0, 2, 2, 2, 2, 2, 0,...
## $ DrivAge     <int> 55, 55, 52, 46, 46, 38, 38, 33, 33, 41, 41, 56, 27, 27, 23, 44, 3...
## $ BonusMalus  <int> 50, 50, 50, 50, 50, 50, 50, 68, 68, 50, 50, 50, 90, 90, 100, 76, ...
## $ VehBrand    <fctr> B12, B12, B12, B12, B12, B12, B12, B12, B12, B12, B12, B12, B12,...
## $ VehGas      <chr> "Regular", "Regular", "Diesel", "Diesel", "Diesel", "Regular", "R...
## $ Density     <int> 1217, 1217, 54, 76, 76, 3003, 3003, 137, 137, 60, 60, 173, 695, 6...
## $ Region      <fctr> R82, R82, R22, R72, R72, R31, R31, R91, R91, R52, R52, R93, R72,...
## $ claim_count <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ claim_total <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...

2.1 Perform Quick Data Cleaning

### _TEMPLATE_
### Do simple datatype transforms and save output in data_tbl
data_tbl <- rawdata_tbl

names(data_tbl) <- rawdata_tbl %>% names %>% clean_names

glimpse(data_tbl)
## Observations: 678,013
## Variables: 14
## $ i_dpol      <dbl> 1, 3, 5, 10, 11, 13, 15, 17, 18, 21, 25, 27, 30, 32, 35, 36, 38, ...
## $ claim_nb    <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2,...
## $ exposure    <dbl> 0.10, 0.77, 0.75, 0.09, 0.84, 0.52, 0.45, 0.27, 0.71, 0.15, 0.75,...
## $ area        <fctr> D, D, B, B, B, E, E, C, C, B, B, C, D, D, E, F, A, A, A, A, A, E...
## $ veh_power   <int> 5, 5, 6, 7, 7, 6, 6, 7, 7, 7, 7, 7, 4, 4, 4, 9, 6, 6, 6, 6, 6, 7,...
## $ veh_age     <int> 0, 0, 2, 0, 0, 2, 2, 0, 0, 0, 0, 0, 1, 0, 9, 0, 2, 2, 2, 2, 2, 0,...
## $ driv_age    <int> 55, 55, 52, 46, 46, 38, 38, 33, 33, 41, 41, 56, 27, 27, 23, 44, 3...
## $ bonus_malus <int> 50, 50, 50, 50, 50, 50, 50, 68, 68, 50, 50, 50, 90, 90, 100, 76, ...
## $ veh_brand   <fctr> B12, B12, B12, B12, B12, B12, B12, B12, B12, B12, B12, B12, B12,...
## $ veh_gas     <chr> "Regular", "Regular", "Diesel", "Diesel", "Diesel", "Regular", "R...
## $ density     <int> 1217, 1217, 54, 76, 76, 3003, 3003, 137, 137, 60, 60, 173, 695, 6...
## $ region      <fctr> R82, R82, R22, R72, R72, R31, R31, R91, R91, R52, R52, R93, R72,...
## $ claim_count <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ claim_total <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...

2.2 Create Derived Variables

We now create derived features useful for modelling. These values are new variables calculated from existing variables in the data.

data_tbl <- data_tbl %>%
    mutate(id_pol      = i_dpol %>% as.character
          ,claim_count = ifelse(is.na(claim_count), 0, claim_count)
          ,claim_total = ifelse(is.na(claim_total), 0, claim_total)
          ,veh_power   = veh_power %>% as.character
          ,bonus_malus = bonus_malus * 0.01
           ) %>%
    select(-i_dpol)

data_tbl %>% glimpse
## Observations: 678,013
## Variables: 14
## $ claim_nb    <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2,...
## $ exposure    <dbl> 0.10, 0.77, 0.75, 0.09, 0.84, 0.52, 0.45, 0.27, 0.71, 0.15, 0.75,...
## $ area        <fctr> D, D, B, B, B, E, E, C, C, B, B, C, D, D, E, F, A, A, A, A, A, E...
## $ veh_power   <chr> "5", "5", "6", "7", "7", "6", "6", "7", "7", "7", "7", "7", "4", ...
## $ veh_age     <int> 0, 0, 2, 0, 0, 2, 2, 0, 0, 0, 0, 0, 1, 0, 9, 0, 2, 2, 2, 2, 2, 0,...
## $ driv_age    <int> 55, 55, 52, 46, 46, 38, 38, 33, 33, 41, 41, 56, 27, 27, 23, 44, 3...
## $ bonus_malus <dbl> 0.50, 0.50, 0.50, 0.50, 0.50, 0.50, 0.50, 0.68, 0.68, 0.50, 0.50,...
## $ veh_brand   <fctr> B12, B12, B12, B12, B12, B12, B12, B12, B12, B12, B12, B12, B12,...
## $ veh_gas     <chr> "Regular", "Regular", "Diesel", "Diesel", "Diesel", "Regular", "R...
## $ density     <int> 1217, 1217, 54, 76, 76, 3003, 3003, 137, 137, 60, 60, 173, 695, 6...
## $ region      <fctr> R82, R82, R22, R72, R72, R31, R31, R91, R91, R52, R52, R93, R72,...
## $ claim_count <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ claim_total <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ id_pol      <chr> "1", "3", "5", "10", "11", "13", "15", "17", "18", "21", "25", "2...

2.3 Check Missing Values

Before we do anything with the data, we check for missing values in the dataset. In some cases, missing data is coded by a special character rather than left blank, so we correct for this first.

### _TEMPLATE_
### ADD CODE TO CORRECT FOR DATA ENCODING HERE
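
### Illustrative only - the freMTPL2 data needs no such recoding, and these
### sentinel values are hypothetical - but special codes could be converted
### to NA like this:
#data_tbl <- data_tbl %>%
#    mutate(veh_gas = na_if(veh_gas, "?")
#          ,density = na_if(density, -1L)
#           )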

With missing data properly encoded, we now visualise the missing data in a number of different ways.

2.3.1 Univariate Missing Data

We first examine a simple univariate count of all the missing data:

row_count <- data_tbl %>% nrow

missing_univariate_tbl <- data_tbl %>%
    summarise_each(funs(sum(is.na(.)))) %>%
    gather('variable','missing_count') %>%
    mutate(missing_prop = missing_count / row_count)

ggplot(missing_univariate_tbl) +
    geom_bar(aes(x = fct_reorder(variable, -missing_prop), weight = missing_prop)) +
    scale_y_continuous(labels = comma) +
    xlab("Variable") +
    ylab("Missing Value Proportion") +
    theme(axis.text.x = element_text(angle = 90))

We remove all variables where every entry is missing:

remove_vars <- missing_univariate_tbl %>%
    filter(missing_count == row_count) %>%
    .[['variable']]

lessmiss_data_tbl <- data_tbl %>%
    select(-one_of(remove_vars))

With these columns removed, we repeat the exercise.

missing_univariate_tbl <- lessmiss_data_tbl %>%
    summarise_each(funs(sum(is.na(.)))) %>%
    gather('variable','missing_count') %>%
    mutate(missing_prop = missing_count / row_count)

ggplot(missing_univariate_tbl) +
    geom_bar(aes(x = fct_reorder(variable, -missing_prop), weight = missing_prop)) +
    scale_y_continuous(labels = comma) +
    xlab("Variable") +
    ylab("Missing Value Proportion") +
    theme(axis.text.x = element_text(angle = 90))

To reduce the scale of this plot, we focus on the variables with the highest missing-value counts (at most fifty of them).

missing_univariate_top_tbl <- missing_univariate_tbl %>%
    arrange(desc(missing_count)) %>%
    top_n(n = 50, wt = missing_count)

ggplot(missing_univariate_top_tbl) +
    geom_bar(aes(x = fct_reorder(variable, -missing_prop), weight = missing_prop)) +
    scale_y_continuous(labels = comma) +
    xlab("Variable") +
    ylab("Missing Value Proportion") +
    theme(axis.text.x = element_text(angle = 90))

2.3.2 Multivariate Missing Data

It is useful to see which combinations of variables tend to have missing values simultaneously. To visualise this, we count how often each combination of variables is missing together and display these counts as a heat map.

missing_plot_tbl <- rawdata_tbl %>%
    mutate_all(funs(is.na)) %>%
    mutate_all(funs(as.numeric)) %>%
    mutate(label = do.call(paste0, (.))) %>%
    group_by(label) %>%
    summarise_all(funs(sum)) %>%
    arrange(desc(label)) %>%
    select(-label) %>%
    mutate(rowid = do.call(pmax, (.))) %>%
    gather('col','count', -rowid) %>%
    mutate(Proportion = count / row_count
          ,rowid      = round(rowid / row_count, 4)
    )

ggplot(missing_plot_tbl) +
    geom_tile(aes(x = col, y = as.factor(rowid), fill = Proportion), height = 0.8) +
    scale_fill_continuous(labels = comma) +
    scale_x_discrete(position = 'top') +
    xlab("Variable") +
    ylab("Missing Value Proportion") +
    theme(axis.text.x = element_text(angle = 90))

This visualisation takes a little explaining.

Each row represents a combination of variables with simultaneous missing values. For each row in the graphic, the coloured entries show which particular variables are missing in that combination. The proportion of rows with that combination is displayed in both the label for the row and the colouring for the cells in the row.
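
A toy example (not part of the template) shows how these pattern labels are built: each distinct string of 0s and 1s corresponds to one combination of missing variables, i.e. one row of the heat map.

toy_tbl <- tibble(a = c(1, NA, NA), b = c(NA, NA, 3))

toy_tbl %>%
    mutate_all(funs(is.na)) %>%
    mutate_all(funs(as.numeric)) %>%
    mutate(label = do.call(paste0, (.)))
## expected labels: "01", "11", "10" - one per missingness pattern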

2.4 Inspect High-level-count Categorical Variables

With the raw data loaded, we now remove obviously unique or near-unique variables that are not amenable to basic exploration and plotting.

coltype_lst <- create_coltype_list(data_tbl)

catvar_valuecount_tbl <- data_tbl %>%
    summarise_at(coltype_lst$split$discrete
                ,function(x) length(unique(x))) %>%
    gather('var_name', 'level_count') %>%
    arrange(-level_count)

print(catvar_valuecount_tbl)
##    var_name level_count
## 1    id_pol      678013
## 2    region          22
## 3 veh_power          12
## 4 veh_brand          11
## 5      area           6
## 6   veh_gas           2
row_count <- nrow(data_tbl)

cat(paste0("Dataset has ", row_count, " rows\n"))
## Dataset has 678013 rows

Now that we have a table of the counts of all the categorical variables, we can automatically exclude unique variables from the exploration, as their level count will match the row count.

unique_vars <- catvar_valuecount_tbl %>%
    filter(level_count == row_count) %>%
    .[["var_name"]]

print(unique_vars)
## [1] "id_pol"
explore_data_tbl <- data_tbl %>%
    select(-one_of(unique_vars))

Having removed the unique identifier variables from the dataset, we may also wish to exclude categorical variables with high level counts, so we create a vector of those variable names.

highcount_vars <- catvar_valuecount_tbl %>%
    filter(level_count >= dataexp_level_exclusion_threshold
          ,level_count < row_count) %>%
    .[["var_name"]]

cat(paste0(highcount_vars, collapse = ', '))

We can now continue with some basic exploration of the data. We may also choose to remove some extra columns from the dataset.

### You may want to comment out these next few lines to customise which
### categoricals are kept in the exploration.
drop_vars <- c(highcount_vars)

if(length(drop_vars) > 0) {
    explore_data_tbl <- explore_data_tbl %>%
        select(-one_of(drop_vars))

    cat(paste0(drop_vars, collapse = ', '))
}

3 Univariate Data Exploration

Now that we have loaded the data, we can prepare it for some basic data exploration. We first exclude the variables that are unique identifiers or similar, and then split the remaining variables into various categories to help with the systematic data exploration.

coltype_lst <- create_coltype_list(explore_data_tbl)

print(coltype_lst)
## $split
## $split$continuous
## [1] "claim_nb"    "exposure"    "veh_age"     "driv_age"    "bonus_malus" "density"    
## [7] "claim_count" "claim_total"
## 
## $split$discrete
## [1] "area"      "veh_power" "veh_brand" "veh_gas"   "region"   
## 
## 
## $columns
##     claim_nb     exposure         area    veh_power      veh_age     driv_age 
## "continuous" "continuous"   "discrete"   "discrete" "continuous" "continuous" 
##  bonus_malus    veh_brand      veh_gas      density       region  claim_count 
## "continuous"   "discrete"   "discrete" "continuous"   "discrete" "continuous" 
##  claim_total 
## "continuous"

3.1 Logical Variables

Logical variables take only two values: TRUE or FALSE. It is useful to see missing data as well, so the count of missing values is included in each plot title.

logical_vars <- coltype_lst$split$logical

for(plot_varname in logical_vars) {
    cat("--\n")
    cat(paste0(plot_varname, '\n'))

    na_count <- explore_data_tbl %>% .[[plot_varname]] %>% is.na %>% sum

    explore_plot <- ggplot(explore_data_tbl) +
        geom_bar(aes_string(x = plot_varname)) +
        xlab(plot_varname) +
        ylab("Count") +
        scale_y_continuous(labels = comma) +
        ggtitle(paste0('Barplot of Counts for Variable: ', plot_varname
                      ,' (', na_count, ' missing values)')) +
        theme(axis.text.x = element_text(angle = 90, vjust = 0.5))

    plot(explore_plot)
}

3.2 Numeric Variables

Numeric variables are usually continuous in nature, though this dataset also contains integer-valued columns.

numeric_vars <- coltype_lst$split$continuous

for(plot_varname in numeric_vars) {
    cat("--\n")
    cat(paste0(plot_varname, '\n'))

    plot_var <- explore_data_tbl %>% .[[plot_varname]]
    na_count <- plot_var %>% is.na %>% sum

    plot_var %>% summary %>% print

    explore_plot <- ggplot(explore_data_tbl) +
        geom_histogram(aes_string(x = plot_varname), bins = dataexp_hist_bins_count) +
        geom_vline(xintercept = mean  (plot_var, na.rm = TRUE), colour = 'red',   size = 1.5) +
        geom_vline(xintercept = median(plot_var, na.rm = TRUE), colour = 'green', size = 1.5) +
        xlab(plot_varname) +
        ylab("Count") +
        scale_x_continuous(labels = comma) +
        scale_y_continuous(labels = comma) +
        ggtitle(paste0('Histogram Plot for Variable: ', plot_varname
                      ,' (', na_count, ' missing values)')
               ,subtitle = '(red line is mean, green line is median)')

    print(explore_plot)
}
## --
## claim_nb
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.0532  0.0000 16.0000

## --
## exposure
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.00273 0.18000 0.49000 0.52875 0.99000 2.01000