This function calculates aggregates of ceteris paribus profiles based on hierarchical clustering.

cluster_profiles(x, ..., aggregate_function = mean,
  variable_type = "numerical", center = FALSE, k = 3,
  variables = NULL)

Arguments

x

a ceteris paribus explainer produced with function ceteris_paribus()

...

other ceteris paribus explainers whose profiles shall be clustered together with x

aggregate_function

a function used for profile aggregation. Defaults to mean.

variable_type

a character. If "numerical" then only numerical variables will be computed. If "categorical" then only categorical variables will be computed.

center

a logical. Shall profiles be centered before clustering? Defaults to FALSE.

k

number of clusters for the hclust function

variables

if not NULL then only the variables listed here will be presented

Value

an object of the class aggregated_profiles_explainer

Details

Find more details in the Clustering Profiles Chapter.

References

Predictive Models: Visual Exploration, Explanation and Debugging https://pbiecek.github.io/PM_VEE

Examples

library("DALEX") selected_passangers <- select_sample(titanic_imputed, n = 100) model_titanic_glm <- glm(survived ~ gender + age + fare, data = titanic_imputed, family = "binomial") explain_titanic_glm <- explain(model_titanic_glm, data = titanic_imputed[,-8], y = titanic_imputed[,8])
#> Preparation of a new explainer is initiated #> -> model label : lm ( default ) #> -> data : 2207 rows 7 cols #> -> target variable : 2207 values #> -> predict function : yhat.glm will be used ( default ) #> -> predicted values : numerical, min = 0.1490412 , mean = 0.3221568 , max = 0.9878987 #> -> residual function : difference between y and yhat ( default ) #> -> residuals : numerical, min = -0.8898433 , mean = 4.198546e-13 , max = 0.8448637 #> -> model_info : package stats , ver. 3.6.1 , task regression ( default ) #> A new explainer has been created!
cp_rf <- ceteris_paribus(explain_titanic_glm, selected_passangers) clust_rf <- cluster_profiles(cp_rf, k = 3, variables = "age") plot(clust_rf)
# \donttest{ library("randomForest") model_titanic_rf <- randomForest(survived ~., data = titanic_imputed)
#> Warning: The response has five or fewer unique values. Are you sure you want to do regression?
model_titanic_rf
#> #> Call: #> randomForest(formula = survived ~ ., data = titanic_imputed) #> Type of random forest: regression #> Number of trees: 500 #> No. of variables tried at each split: 2 #> #> Mean of squared residuals: 0.1401361 #> % Var explained: 35.83
explain_titanic_rf <- explain(model_titanic_rf, data = titanic_imputed[,-8], y = titanic_imputed[,8], label = "Random Forest v7")
#> Preparation of a new explainer is initiated #> -> model label : Random Forest v7 #> -> data : 2207 rows 7 cols #> -> target variable : 2207 values #> -> predict function : yhat.randomForest will be used ( default ) #> -> predicted values : numerical, min = 0.009390473 , mean = 0.322494 , max = 0.9900508 #> -> residual function : difference between y and yhat ( default ) #> -> residuals : numerical, min = -0.7952687 , mean = -0.0003372464 , max = 0.907451 #> -> model_info : package randomForest , ver. 4.6.14 , task regression ( default ) #> A new explainer has been created!
cp_rf <- ceteris_paribus(explain_titanic_rf, selected_passangers) cp_rf
#> Top profiles : #> gender age class embarked fare sibsp parch _yhat_ #> 515 male 45 2nd Southampton 10.1000 0 0 0.09953008 #> 515.1 female 45 2nd Southampton 10.1000 0 0 0.85375605 #> 604 male 17 3rd Southampton 7.1701 1 0 0.11396062 #> 604.1 female 17 3rd Southampton 7.1701 1 0 0.47817331 #> 1430 male 25 engineering crew Southampton 0.0000 0 0 0.23912540 #> 1430.1 female 25 engineering crew Southampton 0.0000 0 0 0.69120718 #> _vname_ _ids_ _label_ #> 515 gender 515 Random Forest v7 #> 515.1 gender 515 Random Forest v7 #> 604 gender 604 Random Forest v7 #> 604.1 gender 604 Random Forest v7 #> 1430 gender 1430 Random Forest v7 #> 1430.1 gender 1430 Random Forest v7 #> #> #> Top observations: #> gender age class embarked fare sibsp parch _yhat_ #> 515 male 45 2nd Southampton 10.1000 0 0 0.09953008 #> 604 male 17 3rd Southampton 7.1701 1 0 0.11396062 #> 1430 male 25 engineering crew Southampton 0.0000 0 0 0.23912540 #> 865 male 20 3rd Cherbourg 7.0406 0 0 0.11449714 #> 452 female 17 3rd Queenstown 7.1408 0 0 0.63337759 #> 1534 male 38 victualling crew Southampton 0.0000 0 0 0.15308120 #> _label_ _ids_ #> 515 Random Forest v7 1 #> 604 Random Forest v7 2 #> 1430 Random Forest v7 3 #> 865 Random Forest v7 4 #> 452 Random Forest v7 5 #> 1534 Random Forest v7 6
pdp_rf <- aggregate_profiles(cp_rf, variables = "age") head(pdp_rf)
#> Top profiles : #> _vname_ _label_ _x_ _yhat_ _ids_ #> 1 age Random Forest v7 0.1666667 0.5207070 0 #> 2 age Random Forest v7 2.0000000 0.5767911 0 #> 3 age Random Forest v7 4.0000000 0.5918950 0 #> 4 age Random Forest v7 7.0000000 0.5450187 0 #> 5 age Random Forest v7 9.0000000 0.5285279 0 #> 6 age Random Forest v7 13.0000000 0.4631297 0
clust_rf <- cluster_profiles(cp_rf, k = 3, variables = "age") head(clust_rf)
#> Top profiles : #> _vname_ _label_ _x_ _cluster_ _yhat_ _ids_ #> 1 age Random Forest v7_1 0.1666667 1 0.4657028 0 #> 2 age Random Forest v7_1 2.0000000 1 0.5395368 0 #> 3 age Random Forest v7_1 4.0000000 1 0.5533772 0 #> 4 age Random Forest v7_1 7.0000000 1 0.4992441 0 #> 5 age Random Forest v7_1 9.0000000 1 0.4782111 0 #> 6 age Random Forest v7_1 13.0000000 1 0.3948779 0
plot(clust_rf, color = "_label_") + show_aggregated_profiles(pdp_rf, color = "black", size = 3)
plot(cp_rf, color = "grey", variables = "age") + show_aggregated_profiles(clust_rf, color = "_label_", size = 2)
clust_rf <- cluster_profiles(cp_rf, k = 3, center = TRUE, variables = "age") head(clust_rf)
#> Top profiles : #> _vname_ _label_ _x_ _cluster_ _yhat_ _ids_ #> 1 age Random Forest v7_1 0.1666667 1 0.5477504 0 #> 2 age Random Forest v7_1 2.0000000 1 0.6254372 0 #> 3 age Random Forest v7_1 4.0000000 1 0.6373459 0 #> 4 age Random Forest v7_1 7.0000000 1 0.5882245 0 #> 5 age Random Forest v7_1 9.0000000 1 0.5630644 0 #> 6 age Random Forest v7_1 13.0000000 1 0.4751353 0
# }