From 2efdb99918cc92738e7de275fcda5a1e4ff88d1b Mon Sep 17 00:00:00 2001 From: sumny Date: Wed, 30 Sep 2020 22:42:15 +0200 Subject: [PATCH 1/8] add experimental other resamplings, handle duplicates and missings --- R/PipeOpLearnerCV.R | 81 +++++++++++++++++++++++++++++++++++++++------ 1 file changed, 70 insertions(+), 11 deletions(-) diff --git a/R/PipeOpLearnerCV.R b/R/PipeOpLearnerCV.R index b8aba7bf6..fc1134081 100644 --- a/R/PipeOpLearnerCV.R +++ b/R/PipeOpLearnerCV.R @@ -110,6 +110,7 @@ #' graph$pipeops$classif.rpart$learner$predict_type = "prob" #' #' graph$train(task) +# FIXME: docs and tests PipeOpLearnerCV = R6Class("PipeOpLearnerCV", inherit = PipeOpTaskPreproc, public = list( @@ -121,11 +122,15 @@ PipeOpLearnerCV = R6Class("PipeOpLearnerCV", task_type = mlr_reflections$task_types[get("type") == private$.learner$task_type][order(get("package"))][1L]$task private$.crossval_param_set = ParamSet$new(params = list( - ParamFct$new("method", levels = c("cv", "insample"), tags = c("train", "required")), + ParamFct$new("method", levels = c("bootstrap", "custom", "cv", "holdout", "insample", "loo", "repeated_cv", "subsampling"), tags = c("train", "required")), + ParamInt$new("repeats", lower = 1L, tags = c("train", "required")), ParamInt$new("folds", lower = 2L, upper = Inf, tags = c("train", "required")), - ParamLgl$new("keep_response", tags = c("train", "required")) + ParamDbl$new("ratio", lower = 0, upper = 1, tags = c("train", "required")), + ParamLgl$new("keep_response", tags = c("train", "required")), + ParamUty$new("train_sets", tags = "train", custom_check = function(x) check_list(types = "atomicvector", any.missing = FALSE)), + ParamUty$new("test_sets", tags = "train", custom_check = function(x) check_list(types = "atomicvector", any.missing = FALSE)) )) - private$.crossval_param_set$values = list(method = "cv", folds = 3, keep_response = FALSE) + private$.crossval_param_set$values = list(method = "cv", repeats = 30L, folds = 3, ratio = 2 / 3, keep_response = FALSE) private$.crossval_param_set$set_id = "resampling" # Dependencies in paradox have been broken from the start and this is known since at least a year: # https://github.com/mlr-org/paradox/issues/216 @@ -169,14 +174,45 @@ PipeOpLearnerCV = R6Class("PipeOpLearnerCV", self$state = private$.learner$train(task)$state pv = private$.crossval_param_set$values - # Compute CV Predictions - if (pv$method != "insample") { - rdesc = mlr_resamplings$get(pv$method) - if (pv$method == "cv") rdesc$param_set$values = list(folds = pv$folds) - rr = resample(task, private$.learner, rdesc) - prds = as.data.table(rr$prediction(predict_sets = "test")) - } else { - prds = as.data.table(private$.learner$predict(task)) + if (pv$method == "insample") { + return(private$pred_to_task(as.data.table(private$.learner$predict(task)), task)) # early exit + } + + # Compute resampled Predictions + rdesc = mlr_resamplings$get(pv$method) + rdesc$param_set$values = switch(pv$method, + "bootstrap" = list(repeats = pv$repeats, ratio = pv$ratio), + "custom" = list(), + "cv" = list(folds = pv$folds), + "holdout" = list(ratio = pv$ratio), + "loo" = list(), + "repeated_cv" = list(repeats = pv$repeats, folds = pv$folds), + "subsampling" = list(repeats = pv$repeats, ratio = pv$ratio)) + if (pv$method == "custom") { + rdesc$instantiate(task, train_sets = private$.crossval_param_set$values$train_sets, test_sets = private$.crossval_param_set$values$test_sets) + } + rr = resample(task, private$.learner, rdesc) + prds = as.data.table(rr$prediction(predict_sets = "test")) + 
nrows_duplicated = length(prds$row_id[duplicated(prds$row_id)]) + missing_rows = setdiff(task$row_ids, prds$row_id) + nrows_missing = length(setdiff(task$row_ids, prds$row_id)) + + if (nrows_duplicated || nrows_missing) { # duplicates or missings + SDcols = setdiff(colnames(prds), c("row_id", "truth")) + prds_corrected = if (nrows_duplicated) { + prds[, map(.SD, aggregation), by = "row_id", .SDcols = SDcols] + } else { + setNames(data.table(matrix(nrow = 0L, ncol = NCOL(prds))), colnames(prds)) + } + prds_extended = as.list(prds_corrected)[SDcols] + prds_extended = map(prds_extended, add_missings, len = nrows_missing) + prds_extended[["row_id"]] = c(prds_corrected[["row_id"]], missing_rows) + prds = setDT(prds_extended) + + target = task$truth(prds$row_id) + if (task$task_type == "classif") { + prds$response = factor(prds$response, levels = levels(target), ordered = is.ordered(target)) + } } private$pred_to_task(prds, task) @@ -204,4 +240,27 @@ PipeOpLearnerCV = R6Class("PipeOpLearnerCV", ) ) +# Helper function for aggregating predictions if duplicated rows are present: +# - handles response, prob etc. naturally +# - if x is a factor (e.g., response if classif) take the mode and return this level as a character (factor fix is applied later) +# - if x is numeric (e.g., response if regr, or prob or se), take the mean (for prob this is invariant w.r.t to [0, 1] boundaries) +aggregation = function(x) { + if (length(x) == 1L) { + return(x) # early exit + } + if (is.factor(x)) { + tt = table(x) + names(tt[which.max(tt)]) + } else { + mean(x, na.rm = TRUE) + } +} + +# Helper function to add missings to predictions based on their storage mode +add_missings = function(x, len) { + c(x, switch(typeof(x), + "character" = rep_len(NA_character_, length.out = len), + "double" = rep_len(NA_real_, length.out = len))) +} + mlr_pipeops$add("learner_cv", PipeOpLearnerCV, list(R6Class("Learner", public = list(id = "learner_cv", task_type = "classif", param_set = ParamSet$new()))$new())) From b8e1deb157ae343a15df1060898ef24950fef3e6 Mon Sep 17 00:00:00 2001 From: sumny Date: Thu, 1 Oct 2020 16:51:08 +0200 Subject: [PATCH 2/8] extend PipeOpLearnerCV to other resamplings, add tests, update docs --- R/PipeOpLearnerCV.R | 126 +++++++++++++-------- man/Graph.Rd | 4 +- man/mlr_pipeops_histbin.Rd | 2 +- man/mlr_pipeops_learner_cv.Rd | 31 ++++-- man/mlr_pipeops_nmf.Rd | 2 +- man/mlr_pipeops_targetmutate.Rd | 2 +- man/mlr_pipeops_tunethreshold.Rd | 4 +- tests/testthat/test_pipeop_learnercv.R | 145 ++++++++++++++++++++++++- 8 files changed, 250 insertions(+), 66 deletions(-) diff --git a/R/PipeOpLearnerCV.R b/R/PipeOpLearnerCV.R index fc1134081..365bec0d2 100644 --- a/R/PipeOpLearnerCV.R +++ b/R/PipeOpLearnerCV.R @@ -7,7 +7,7 @@ #' @description #' Wraps an [`mlr3::Learner`] into a [`PipeOp`]. #' -#' Returns cross-validated predictions during training as a [`Task`][mlr3::Task] and stores a model of the +#' Returns resampled predictions during training as a [`Task`][mlr3::Task] and stores a model of the #' [`Learner`][mlr3::Learner] trained on the whole data in `$state`. This is used to create a similar #' [`Task`][mlr3::Task] during prediction. #' @@ -19,7 +19,7 @@ #' Inherits the `$param_set` (and therefore `$param_set$values`) from the [`Learner`][mlr3::Learner] it is constructed from. #' #' [`PipeOpLearnerCV`] can be used to create "stacking" or "super learning" [`Graph`]s that use the output of one [`Learner`][mlr3::Learner] -#' as feature for another [`Learner`][mlr3::Learner]. 
Because the [`PipeOpLearnerCV`] erases the original input features, it is often +#' as features for another [`Learner`][mlr3::Learner]. Because the [`PipeOpLearnerCV`] erases the original input features, it is often #' useful to use [`PipeOpFeatureUnion`] to bind the prediction [`Task`][mlr3::Task] to the original input [`Task`][mlr3::Task]. #' #' @section Construction: @@ -28,8 +28,7 @@ #' ``` #' #' * `learner` :: [`Learner`][mlr3::Learner] \cr -#' [`Learner`][mlr3::Learner] to use for cross validation / prediction, or a string identifying a -#' [`Learner`][mlr3::Learner] in the [`mlr3::mlr_learners`] [`Dictionary`][mlr3misc::Dictionary]. +#' [`Learner`][mlr3::Learner] to use for resampling / prediction. #' * `id` :: `character(1)` #' Identifier of the resulting object, internally defaulting to the `id` of the [`Learner`][mlr3::Learner] being wrapped. #' * `param_vals` :: named `list`\cr @@ -43,7 +42,7 @@ #' type given to `learner` during construction; both during training and prediction. #' #' The output is a task with the same target as the input task, with features replaced by predictions made by the [`Learner`][mlr3::Learner]. -#' During training, this prediction is the out-of-sample prediction made by [`resample`][mlr3::resample], during prediction, this is the +#' During training, this prediction is the prediction made by [`resample`][mlr3::resample], during prediction, this is the #' ordinary prediction made on the data by a [`Learner`][mlr3::Learner] trained on the training phase data. #' #' @section State: @@ -64,10 +63,24 @@ #' The parameters are the parameters inherited from the [`PipeOpTaskPreproc`], as well as the parameters of the [`Learner`][mlr3::Learner] wrapped by this object. #' Besides that, parameters introduced are: #' * `resampling.method` :: `character(1)`\cr -#' Which resampling method do we want to use. Currently only supports `"cv"` and `"insample"`. `"insample"` generates -#' predictions with the model trained on all training data. -#' * `resampling.folds` :: `numeric(1)`\cr -#' Number of cross validation folds. Initialized to 3. Only used for `resampling.method = "cv"`. +#' Which resampling method to use. Supports `"cv"`,`"bootstrap"`, `"holdout"`, `"loo"`, `"repeated_cv"`, `"subsampling"`, `"custom"` and `"insample"`. +#' See [`mlr_resamplings`][mlr3::mlr_resamplings]. +#' `"insample"` generates predictions with the model trained on all training data. +#' In the case of the resampling method returing multiple predictions per row id, the predictions are aggregated via their mean +#' (execpt for the `"response"` in the case of a [classification Task][mlr3::TaskClassif] which is aggregated using the mode). +#' In the case of the resampling method not returning predictions for all row ids as given in the input [`Task`][mlr3::Task], these predictions are added as missing. +#' * `resampling.repeats` :: `integer(1)`\cr +#' Number of repetitions. Initialized to 30. Only used for `resampling.method = "bootstrap"`, or `"repeated_cv"`, or `"subsampling"`. +#' * `resampling.folds` :: `integer(1)`\cr +#' Number of cross validation folds. Initialized to 3. Only used for `resampling.method = "cv"`, or `"repeated_cv"`. +#' * `resampling.ratio` :: `numeric(1)`\cr +#' Ratio of observations to put into the training set. Initialized to 2/3. Only used for `resampling.method = "bootstrap"`, or `"holdout"` or `"subsampling"`. +#' * `resampling.custom.train_sets` :: `list()`\cr +#' List with row ids for training, one list element per iteration. 
Must have the same length as `resampling.custom.test_sets`. +#' Only used for `resampling.method = "custom"`. +#' * `resampling.custom.test_sets` :: `list()`\cr +#' List with row ids for testing, one list element per iteration. Must have the same length as `resampling.custom.train_sets`. +#' Only used for `resampling.method = "custom"`. #' * `keep_response` :: `logical(1)`\cr #' Only effective during `"prob"` prediction: Whether to keep response values, if available. Initialized to `FALSE`. #' @@ -110,7 +123,6 @@ #' graph$pipeops$classif.rpart$learner$predict_type = "prob" #' #' graph$train(task) -# FIXME: docs and tests PipeOpLearnerCV = R6Class("PipeOpLearnerCV", inherit = PipeOpTaskPreproc, public = list( @@ -127,10 +139,10 @@ PipeOpLearnerCV = R6Class("PipeOpLearnerCV", ParamInt$new("folds", lower = 2L, upper = Inf, tags = c("train", "required")), ParamDbl$new("ratio", lower = 0, upper = 1, tags = c("train", "required")), ParamLgl$new("keep_response", tags = c("train", "required")), - ParamUty$new("train_sets", tags = "train", custom_check = function(x) check_list(types = "atomicvector", any.missing = FALSE)), - ParamUty$new("test_sets", tags = "train", custom_check = function(x) check_list(types = "atomicvector", any.missing = FALSE)) + ParamUty$new("custom.train_sets", tags = "train", custom_check = function(x) check_list(x, types = "atomicvector", any.missing = FALSE)), + ParamUty$new("custom.test_sets", tags = "train", custom_check = function(x) check_list(x, types = "atomicvector", any.missing = FALSE)) )) - private$.crossval_param_set$values = list(method = "cv", repeats = 30L, folds = 3, ratio = 2 / 3, keep_response = FALSE) + private$.crossval_param_set$values = list(method = "cv", repeats = 30L, folds = 3L, ratio = 2 / 3, keep_response = FALSE) private$.crossval_param_set$set_id = "resampling" # Dependencies in paradox have been broken from the start and this is known since at least a year: # https://github.com/mlr-org/paradox/issues/216 @@ -189,33 +201,68 @@ PipeOpLearnerCV = R6Class("PipeOpLearnerCV", "repeated_cv" = list(repeats = pv$repeats, folds = pv$folds), "subsampling" = list(repeats = pv$repeats, ratio = pv$ratio)) if (pv$method == "custom") { - rdesc$instantiate(task, train_sets = private$.crossval_param_set$values$train_sets, test_sets = private$.crossval_param_set$values$test_sets) + rdesc$instantiate(task, train_sets = private$.crossval_param_set$values$custom.train_sets, test_sets = private$.crossval_param_set$values$custom.test_sets) } + # FIXME: we may want to instantiate here in general for safety reasons rr = resample(task, private$.learner, rdesc) prds = as.data.table(rr$prediction(predict_sets = "test")) - nrows_duplicated = length(prds$row_id[duplicated(prds$row_id)]) + nrows_multiple = length(prds$row_id[duplicated(prds$row_id)]) missing_rows = setdiff(task$row_ids, prds$row_id) - nrows_missing = length(setdiff(task$row_ids, prds$row_id)) + nrows_missing = length(missing_rows) - if (nrows_duplicated || nrows_missing) { # duplicates or missings - SDcols = setdiff(colnames(prds), c("row_id", "truth")) - prds_corrected = if (nrows_duplicated) { - prds[, map(.SD, aggregation), by = "row_id", .SDcols = SDcols] + if (!nrows_multiple && !nrows_missing) { + return(private$pred_to_task(prds, task)) # early exit + } + + # Some resamplings will result in rows being sampled multiple times and some being missing + task_type = task$task_type + prds_names = colnames(prds) + + prds_corrected = if (nrows_multiple) { + # classif: prob, regr: response, (se) + 
SDcols_multiple = setdiff(prds_names, if (task_type == "classif") c("row_id", "truth", "response") else c("row_id", "truth")) + + # aggregation functions: + # - mean for prob, response (regr), se + # - mode for response (classif) + prds_corrected = prds[, map(.SD, function(x) { + if (length(x) == 1L) return(x) # early exit + mean(x, na.rm = TRUE) + }), by = "row_id", .SDcols = SDcols_multiple] + + if (NROW(prds_corrected) == 0L) prds_corrected = unique(prds[, "row_id"]) + + if (task_type == "classif") { + cbind(prds_corrected, prds[, map(.SD, function(x) { + if (length(x) == 1L) return(as.character(x)) # early exit + tt = table(x) + names(tt[which.max(tt)]) + }), by = "row_id", .SDcols = "response"][, "response"]) } else { - setNames(data.table(matrix(nrow = 0L, ncol = NCOL(prds))), colnames(prds)) + prds_corrected } - prds_extended = as.list(prds_corrected)[SDcols] - prds_extended = map(prds_extended, add_missings, len = nrows_missing) - prds_extended[["row_id"]] = c(prds_corrected[["row_id"]], missing_rows) - prds = setDT(prds_extended) - - target = task$truth(prds$row_id) - if (task$task_type == "classif") { - prds$response = factor(prds$response, levels = levels(target), ordered = is.ordered(target)) + } else { + if (task_type == "classif") { + prds[, "response" := as.character(response)] } + prds[, !"truth"] } - private$pred_to_task(prds, task) + if (nrows_missing) { + SDcols_missing = setdiff(prds_names, "truth") + # add missings + prds_corrected = prds_corrected[, map(.SD, add_missings, len = nrows_missing), .SDcols = SDcols_missing] + prds_corrected$row_id[is.na(prds_corrected$row_id)] = missing_rows + } + + if (task_type == "classif") { + target = task$truth(prds_corrected$row_id) + prds_corrected$response = factor(prds_corrected$response, levels = levels(target), ordered = is.ordered(target)) + } + + # FIXME: safety cheks? + + private$pred_to_task(prds_corrected, task) }, .predict_task = function(task) { @@ -240,27 +287,12 @@ PipeOpLearnerCV = R6Class("PipeOpLearnerCV", ) ) -# Helper function for aggregating predictions if duplicated rows are present: -# - handles response, prob etc. naturally -# - if x is a factor (e.g., response if classif) take the mode and return this level as a character (factor fix is applied later) -# - if x is numeric (e.g., response if regr, or prob or se), take the mean (for prob this is invariant w.r.t to [0, 1] boundaries) -aggregation = function(x) { - if (length(x) == 1L) { - return(x) # early exit - } - if (is.factor(x)) { - tt = table(x) - names(tt[which.max(tt)]) - } else { - mean(x, na.rm = TRUE) - } -} - # Helper function to add missings to predictions based on their storage mode add_missings = function(x, len) { c(x, switch(typeof(x), "character" = rep_len(NA_character_, length.out = len), - "double" = rep_len(NA_real_, length.out = len))) + "double" = rep_len(NA_real_, length.out = len), + "integer" = rep_len(NA_integer_, length.out = len))) } mlr_pipeops$add("learner_cv", PipeOpLearnerCV, list(R6Class("Learner", public = list(id = "learner_cv", task_type = "classif", param_set = ParamSet$new()))$new())) diff --git a/man/Graph.Rd b/man/Graph.Rd index 91b9fecfc..599473314 100644 --- a/man/Graph.Rd +++ b/man/Graph.Rd @@ -94,8 +94,8 @@ are therefore unambiguous, they can be omitted (i.e. left as \code{NULL}). 
\item \code{plot(html)} \cr (\code{logical(1)}) -> \code{NULL} \cr Plot the \code{\link{Graph}}, using either the \pkg{igraph} package (for \code{html = FALSE}, default) or -the \code{visNetwork} package for \code{html = TRUE} producing a \code{\link[htmlwidgets:htmlwidgets]{htmlWidget}}. -The \code{\link[htmlwidgets:htmlwidgets]{htmlWidget}} can be rescaled using \code{\link[visNetwork:visOptions]{visOptions}}. +the \code{visNetwork} package for \code{html = TRUE} producing a \code{\link[htmlwidgets:htmlwidgets-package]{htmlWidget}}. +The \code{\link[htmlwidgets:htmlwidgets-package]{htmlWidget}} can be rescaled using \code{\link[visNetwork:visOptions]{visOptions}}. \item \code{print(dot = FALSE, dotname = "dot", fontsize = 24L)} \cr (\code{logical(1)}, \code{character(1)}, \code{integer(1)}) -> \code{NULL} \cr Print a representation of the \code{\link{Graph}} on the console. If \code{dot} is \code{FALSE}, output is a table with one row for each contained \code{\link{PipeOp}} and diff --git a/man/mlr_pipeops_histbin.Rd b/man/mlr_pipeops_histbin.Rd index bf4d1423f..2b50a748b 100644 --- a/man/mlr_pipeops_histbin.Rd +++ b/man/mlr_pipeops_histbin.Rd @@ -49,7 +49,7 @@ Either a \code{character(1)} string naming an algorithm to compute the number of a \code{numeric(1)} giving the number of breaks for the histogram, a vector \code{numeric} giving the breakpoints between the histogram cells, or a \code{function} to compute the vector of breakpoints or to compute the number -of cells. Default is algorithm \code{"Sturges"} (see \code{\link[grDevices:nclass.Sturges]{grDevices::nclass.Sturges()}}). +of cells. Default is algorithm \code{"Sturges"} (see \code{\link[grDevices:nclass]{grDevices::nclass.Sturges()}}). For details see \code{\link[graphics:hist]{hist()}}. } } diff --git a/man/mlr_pipeops_learner_cv.Rd b/man/mlr_pipeops_learner_cv.Rd index 202904de1..db0aae79c 100644 --- a/man/mlr_pipeops_learner_cv.Rd +++ b/man/mlr_pipeops_learner_cv.Rd @@ -10,7 +10,7 @@ \description{ Wraps an \code{\link[mlr3:Learner]{mlr3::Learner}} into a \code{\link{PipeOp}}. -Returns cross-validated predictions during training as a \code{\link[mlr3:Task]{Task}} and stores a model of the +Returns resampled predictions during training as a \code{\link[mlr3:Task]{Task}} and stores a model of the \code{\link[mlr3:Learner]{Learner}} trained on the whole data in \verb{$state}. This is used to create a similar \code{\link[mlr3:Task]{Task}} during prediction. @@ -22,7 +22,7 @@ are \verb{.response} and \verb{.se}. \verb{} denotes the \verb{$id} Inherits the \verb{$param_set} (and therefore \verb{$param_set$values}) from the \code{\link[mlr3:Learner]{Learner}} it is constructed from. \code{\link{PipeOpLearnerCV}} can be used to create "stacking" or "super learning" \code{\link{Graph}}s that use the output of one \code{\link[mlr3:Learner]{Learner}} -as feature for another \code{\link[mlr3:Learner]{Learner}}. Because the \code{\link{PipeOpLearnerCV}} erases the original input features, it is often +as features for another \code{\link[mlr3:Learner]{Learner}}. Because the \code{\link{PipeOpLearnerCV}} erases the original input features, it is often useful to use \code{\link{PipeOpFeatureUnion}} to bind the prediction \code{\link[mlr3:Task]{Task}} to the original input \code{\link[mlr3:Task]{Task}}. 
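The stacking pattern described above can be put together in a few lines. The following is only a minimal sketch, assuming the mlr3, mlr3pipelines, and rpart packages are attached; the choice of task, level-0 learner, and meta-learner is arbitrary and purely illustrative:

library(mlr3)
library(mlr3pipelines)

task = tsk("iris")

# level 0: cross-validated rpart class probabilities replace the original features;
# po("nop") passes the original features through so featureunion can re-attach them
graph = gunion(list(
  po("learner_cv", learner = lrn("classif.rpart", predict_type = "prob"), id = "rpart_cv"),
  po("nop")
)) %>>%
  po("featureunion") %>>%
  lrn("classif.rpart")

glrn = GraphLearner$new(graph)
glrn$train(task)
glrn$predict(task)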
} \section{Construction}{ @@ -30,8 +30,7 @@ useful to use \code{\link{PipeOpFeatureUnion}} to bind the prediction \code{\lin } \itemize{ \item \code{learner} :: \code{\link[mlr3:Learner]{Learner}} \cr -\code{\link[mlr3:Learner]{Learner}} to use for cross validation / prediction, or a string identifying a -\code{\link[mlr3:Learner]{Learner}} in the \code{\link[mlr3:mlr_learners]{mlr3::mlr_learners}} \code{\link[mlr3misc:Dictionary]{Dictionary}}. +\code{\link[mlr3:Learner]{Learner}} to use for resampling / prediction. \item \code{id} :: \code{character(1)} Identifier of the resulting object, internally defaulting to the \code{id} of the \code{\link[mlr3:Learner]{Learner}} being wrapped. \item \code{param_vals} :: named \code{list}\cr @@ -48,7 +47,7 @@ type given to \code{learner} during construction; both during training and predi type given to \code{learner} during construction; both during training and prediction. The output is a task with the same target as the input task, with features replaced by predictions made by the \code{\link[mlr3:Learner]{Learner}}. -During training, this prediction is the out-of-sample prediction made by \code{\link[mlr3:resample]{resample}}, during prediction, this is the +During training, this prediction is the prediction made by \code{\link[mlr3:resample]{resample}}, during prediction, this is the ordinary prediction made on the data by a \code{\link[mlr3:Learner]{Learner}} trained on the training phase data. } @@ -76,10 +75,24 @@ The parameters are the parameters inherited from the \code{\link{PipeOpTaskPrepr Besides that, parameters introduced are: \itemize{ \item \code{resampling.method} :: \code{character(1)}\cr -Which resampling method do we want to use. Currently only supports \code{"cv"} and \code{"insample"}. \code{"insample"} generates -predictions with the model trained on all training data. -\item \code{resampling.folds} :: \code{numeric(1)}\cr -Number of cross validation folds. Initialized to 3. Only used for \code{resampling.method = "cv"}. +Which resampling method to use. Supports \code{"cv"},\code{"bootstrap"}, \code{"holdout"}, \code{"loo"}, \code{"repeated_cv"}, \code{"subsampling"}, \code{"custom"} and \code{"insample"}. +See \code{\link[mlr3:mlr_resamplings]{mlr_resamplings}}. +\code{"insample"} generates predictions with the model trained on all training data. +In the case of the resampling method returing multiple predictions per row id, the predictions are aggregated via their mean +(execpt for the \code{"response"} in the case of a \link[mlr3:TaskClassif]{classification Task} which is aggregated using the mode). +In the case of the resampling method not returning predictions for all row ids as given in the input \code{\link[mlr3:Task]{Task}}, these predictions are added as missing. +\item \code{resampling.repeats} :: \code{integer(1)}\cr +Number of repetitions. Initialized to 30. Only used for \code{resampling.method = "bootstrap"}, or \code{"repeated_cv"}, or \code{"subsampling"}. +\item \code{resampling.folds} :: \code{integer(1)}\cr +Number of cross validation folds. Initialized to 3. Only used for \code{resampling.method = "cv"}, or \code{"repeated_cv"}. +\item \code{resampling.ratio} :: \code{numeric(1)}\cr +Ratio of observations to put into the training set. Initialized to 2/3. Only used for \code{resampling.method = "bootstrap"}, or \code{"holdout"} or \code{"subsampling"}. +\item \code{resampling.custom.train_sets} :: \code{list()}\cr +List with row ids for training, one list element per iteration. 
Must have the same length as \code{resampling.custom.test_sets}. +Only used for \code{resampling.method = "custom"}. +\item \code{resampling.custom.test_sets} :: \code{list()}\cr +List with row ids for testing, one list element per iteration. Must have the same length as \code{resampling.custom.train_sets}. +Only used for \code{resampling.method = "custom"}. \item \code{keep_response} :: \code{logical(1)}\cr Only effective during \code{"prob"} prediction: Whether to keep response values, if available. Initialized to \code{FALSE}. } diff --git a/man/mlr_pipeops_nmf.Rd b/man/mlr_pipeops_nmf.Rd index 721f2a45f..78a4a5140 100644 --- a/man/mlr_pipeops_nmf.Rd +++ b/man/mlr_pipeops_nmf.Rd @@ -59,7 +59,7 @@ to use \code{mlr3}'s \code{future}-based parallelization. \section{Internals}{ -Uses the \code{\link[NMF:nmf]{nmf}} function as well as \code{\link[NMF:basis]{basis}}, \code{\link[NMF:coef]{coef}} and +Uses the \code{\link[NMF:nmf]{nmf}} function as well as \code{\link[NMF:basis-coef-methods]{basis}}, \code{\link[NMF:basis-coef-methods]{coef}} and \code{\link[MASS:ginv]{ginv}}. } diff --git a/man/mlr_pipeops_targetmutate.Rd b/man/mlr_pipeops_targetmutate.Rd index 12f0589d2..75ca70a6c 100644 --- a/man/mlr_pipeops_targetmutate.Rd +++ b/man/mlr_pipeops_targetmutate.Rd @@ -44,7 +44,7 @@ The parameters are the parameters inherited from \code{\link{PipeOpTargetTrafo}} Transformation function for the target. Should only be a function of the target, i.e., taking a single \code{data.table} argument, typically with one column. The return value is used as the new target of the resulting \code{\link[mlr3:Task]{Task}}. To change target names, change the column name of the data -using e.g. \code{\link[data.table:setnames]{setnames()}}.\cr +using e.g. \code{\link[data.table:setattr]{setnames()}}.\cr Note that this function also gets called during prediction and should thus gracefully handle \code{NA} values.\cr Initialized to \code{identity()}. \item \code{inverter} :: \code{function} \code{data.table} -> \code{data.table} | named \code{list}\cr diff --git a/man/mlr_pipeops_tunethreshold.Rd b/man/mlr_pipeops_tunethreshold.Rd index a46a84846..7f104e6f8 100644 --- a/man/mlr_pipeops_tunethreshold.Rd +++ b/man/mlr_pipeops_tunethreshold.Rd @@ -19,7 +19,7 @@ Returns a single \code{\link[mlr3:PredictionClassif]{PredictionClassif}}. This PipeOp should be used in conjunction with \code{\link{PipeOpLearnerCV}} in order to optimize thresholds of cross-validated predictions. In order to optimize thresholds without cross-validation, use \code{\link{PipeOpLearnerCV}} -in conjunction with \code{\link[mlr3:ResamplingInsample]{ResamplingInsample}}. +in conjunction with \code{\link[mlr3:mlr_resamplings_insample]{ResamplingInsample}}. } \section{Construction}{ \preformatted{* `PipeOpTuneThreshold$new(id = "tunethreshold", param_vals = list())` \\cr @@ -58,7 +58,7 @@ Initialized to \code{"classif.ce"}, i.e. misclassification error. \item \code{optimizer} :: \code{\link[bbotk:Optimizer]{Optimizer}}|\code{character(1)}\cr \code{\link[bbotk:Optimizer]{Optimizer}} used to find optimal thresholds. If \code{character}, converts to \code{\link[bbotk:Optimizer]{Optimizer}} -via \code{\link[bbotk:opt]{opt}}. Initialized to \code{\link[bbotk:OptimizerGenSA]{OptimizerGenSA}}. +via \code{\link[bbotk:opt]{opt}}. Initialized to \code{\link[bbotk:mlr_optimizers_gensa]{OptimizerGenSA}}. \item \code{log_level} :: \code{character(1)} | \code{integer(1)}\cr Set a temporary log-level for \code{lgr::get_logger("bbotk")}. Initialized to: "warn". 
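To make the interplay with PipeOpLearnerCV mentioned above concrete, here is a minimal sketch of the threshold-tuning pattern; it assumes mlr3, mlr3pipelines, rpart, and the bbotk/GenSA optimization backend are installed, and the task and learner are arbitrary placeholders:

library(mlr3)
library(mlr3pipelines)

task = tsk("german_credit")

# cross-validated class probabilities from learner_cv feed the threshold search
graph = po("learner_cv", learner = lrn("classif.rpart", predict_type = "prob")) %>>%
  po("tunethreshold")

glrn = GraphLearner$new(graph)
glrn$train(task)
glrn$predict(task)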
} diff --git a/tests/testthat/test_pipeop_learnercv.R b/tests/testthat/test_pipeop_learnercv.R index 34d0e475a..038c3fc14 100644 --- a/tests/testthat/test_pipeop_learnercv.R +++ b/tests/testthat/test_pipeop_learnercv.R @@ -42,11 +42,11 @@ test_that("PipeOpLearnerCV - param values", { lrn = mlr_learners$get("classif.rpart") polrn = PipeOpLearnerCV$new(lrn) expect_subset(c("minsplit", "resampling.method", "resampling.folds"), names(polrn$param_set$params)) - expect_equal(polrn$param_set$values, list(resampling.method = "cv", resampling.folds = 3, resampling.keep_response = FALSE, xval = 0)) + expect_equal(polrn$param_set$values, list(resampling.method = "cv", resampling.repeats = 30, resampling.folds = 3, resampling.ratio = 2/3, resampling.keep_response = FALSE, xval = 0)) polrn$param_set$values$minsplit = 2 - expect_equal(polrn$param_set$values, list(resampling.method = "cv", resampling.folds = 3, resampling.keep_response = FALSE, minsplit = 2, xval = 0)) + expect_equal(polrn$param_set$values, list(resampling.method = "cv", resampling.repeats = 30, resampling.folds = 3, resampling.ratio = 2/3, resampling.keep_response = FALSE, minsplit = 2, xval = 0)) polrn$param_set$values$resampling.folds = 4 - expect_equal(polrn$param_set$values, list(resampling.method = "cv", resampling.folds = 4, resampling.keep_response = FALSE, minsplit = 2, xval = 0)) + expect_equal(polrn$param_set$values, list(resampling.method = "cv", resampling.repeats = 30, resampling.folds = 4, resampling.ratio = 2/3, resampling.keep_response = FALSE, minsplit = 2, xval = 0)) }) test_that("PipeOpLearnerCV - within resampling", { @@ -98,3 +98,142 @@ test_that("PipeOpLearnerCV - model active binding to state", { expect_null(po$learner$state) expect_equal(po$learner_model$state, po$state) }) + +test_that("PipeOpLearnerCV - different methods", { + skip_on_cran() # takes too long + # Helper + test_valid_resampled_task = function(polrn, task, predict_type) { + polrn$param_set$values$resampling.keep_response = FALSE + polrn$learner$predict_type = predict_type + + train_out = polrn$train(list(task))[[1]] + train_out_data = train_out$data() + expect_identical(task$row_ids, train_out$row_ids) + + if (task$task_type == "classif") { + if (polrn$learner$predict_type == "response") { + feature = train_out$data(cols = grep("*.response", train_out$feature_names, value = TRUE))[[1L]] + expect_true(is.factor(feature)) + expect_identical(task$class_names, levels(feature)) + } else { # "prob" + features = train_out$data(cols = grep("*.prob*", train_out$feature_names, value = TRUE)) + sums = rowSums(is.na(features)) + expect_true(all(sums == 0 | sums == NCOL(features))) # either all or none missing + features = features[sums == 0, ] + expect_true(all(apply(features, MARGIN = 2L, function(x) x >= 0 & x <= 1))) # between 0 and 1 + expect_equal(rowSums(features), rep_len(1, length.out = NROW(features))) # sum is 1 + } + } else { # "regr" + if (polrn$learner$predict_type == "response") { + feature = train_out$data(cols = grep("*.response", train_out$feature_names, value = TRUE))[[1L]] + expect_true(is.numeric(feature)) + } else { # "se" + features = train_out$data(cols = grep("*.response|*.se", train_out$feature_names, value = TRUE)) + expect_true(all(apply(features, MARGIN = 2L, is.numeric))) + } + } + } + + polrnc = PipeOpLearnerCV$new(LearnerClassifRpart$new(), param_vals = list(resampling.method = "cv", resampling.folds = 2, resampling.repeats = 2)) + polrnr = PipeOpLearnerCV$new(mlr3learners::LearnerRegrLM$new(), param_vals = 
list(resampling.method = "cv", resampling.folds = 2, resampling.repeats = 2)) + + set.seed(1234) + # faster training + taskc = tsk("german_credit")$filter(sample(1000, 50)) + taskc$select("age") + taskr = tsk("boston_housing")$filter(sample(sample(506, 50))) + taskr$select("rad") + + # cv (see params above) + test_valid_resampled_task(polrnc, taskc, "response") + test_valid_resampled_task(polrnc, taskc, "prob") + test_valid_resampled_task(polrnr, taskr, "se") + + # bootstrap + polrnc$param_set$values$resampling.method = "bootstrap" + polrnr$param_set$values$resampling.method = "bootstrap" + test_valid_resampled_task(polrnc, taskc, "response") + test_valid_resampled_task(polrnc, taskc, "prob") + test_valid_resampled_task(polrnr, taskr, "se") + + # holdout + polrnc$param_set$values$resampling.method = "holdout" + polrnr$param_set$values$resampling.method = "holdout" + test_valid_resampled_task(polrnc, taskc, "response") + test_valid_resampled_task(polrnc, taskc, "prob") + test_valid_resampled_task(polrnr, taskr, "se") + + # loo + polrnc$param_set$values$resampling.method = "loo" + polrnr$param_set$values$resampling.method = "loo" + test_valid_resampled_task(polrnc, taskc, "response") + test_valid_resampled_task(polrnc, taskc, "prob") + test_valid_resampled_task(polrnr, taskr, "se") + + # repeated_cv + polrnc$param_set$values$resampling.method = "repeated_cv" + polrnr$param_set$values$resampling.method = "repeated_cv" + test_valid_resampled_task(polrnc, taskc, "response") + test_valid_resampled_task(polrnc, taskc, "prob") + test_valid_resampled_task(polrnr, taskr, "se") + + # subsampling + polrnc$param_set$values$resampling.method = "subsampling" + polrnr$param_set$values$resampling.method = "subsampling" + test_valid_resampled_task(polrnc, taskc, "response") + test_valid_resampled_task(polrnc, taskc, "prob") + test_valid_resampled_task(polrnr, taskr, "se") + + # custom + # classif + polrnc$param_set$values$resampling.method = "custom" + polrnc$param_set$values$resampling.custom.train_sets = list(taskc$row_ids[1:25], taskc$row_ids[26:50]) + polrnc$param_set$values$resampling.custom.test_sets = list(taskc$row_ids[1:25], taskc$row_ids[26:50]) # no multiples no missings + test_valid_resampled_task(polrnc, taskc, "response") + test_valid_resampled_task(polrnc, taskc, "prob") + + polrnc$param_set$values$resampling.custom.test_sets = list(taskc$row_ids[1:25], taskc$row_ids[1:50]) # multiples but no missings + test_valid_resampled_task(polrnc, taskc, "response") + test_valid_resampled_task(polrnc, taskc, "prob") + + polrnc$param_set$values$resampling.custom.test_sets = list(taskc$row_ids[1:25], taskc$row_ids[26:45]) # no multiples but missings + test_valid_resampled_task(polrnc, taskc, "response") + test_valid_resampled_task(polrnc, taskc, "prob") + polrnc$learner$predict_type = "response" + feature_out = polrnc$train(list(taskc))[[1L]]$data(cols = "classif.rpart.response")[[1L]] + expect_true(all(which(is.na(feature_out)) == 46:50)) + polrnc$learner$predict_type = "prob" + features_out = polrnc$train(list(taskc))[[1L]]$data(cols = c("classif.rpart.prob.good", "classif.rpart.prob.bad")) + expect_true(all(which(rowSums(is.na(features_out)) == 2L) == 46:50)) + + polrnc$param_set$values$resampling.custom.test_sets = list(taskc$row_ids[1:25], taskc$row_ids[20:45]) # multiples and missings + test_valid_resampled_task(polrnc, taskc, "response") + test_valid_resampled_task(polrnc, taskc, "prob") + polrnc$learner$predict_type = "response" + feature_out = polrnc$train(list(taskc))[[1L]]$data(cols = 
"classif.rpart.response")[[1L]] + expect_true(all(which(is.na(feature_out)) == 46:50)) + polrnc$learner$predict_type = "prob" + features_out = polrnc$train(list(taskc))[[1L]]$data(cols = c("classif.rpart.prob.good", "classif.rpart.prob.bad")) + expect_true(all(which(rowSums(is.na(features_out)) == 2L) == 46:50)) + + # regr + polrnr$param_set$values$resampling.method = "custom" + polrnr$param_set$values$resampling.custom.train_sets = list(taskr$row_ids[1:25], taskr$row_ids[26:50]) + polrnr$param_set$values$resampling.custom.test_sets = list(taskr$row_ids[1:25], taskr$row_ids[26:50]) # no multiples no missings + test_valid_resampled_task(polrnr, taskr, "se") + + polrnr$param_set$values$resampling.custom.test_sets = list(taskr$row_ids[1:25], taskr$row_ids[1:50]) # multiples but no missings + test_valid_resampled_task(polrnr, taskr, "se") + + polrnr$param_set$values$resampling.custom.test_sets = list(taskr$row_ids[1:25], taskr$row_ids[26:45]) # no multiples but missings + test_valid_resampled_task(polrnr, taskr, "se") + polrnr$learner$predict_type = "se" + features_out = polrnr$train(list(taskr))[[1L]]$data(cols = c("regr.lm.response", "regr.lm.se")) + expect_true(all(which(rowSums(is.na(features_out)) == 2L) == 46:50)) + + polrnr$param_set$values$resampling.custom.test_sets = list(taskr$row_ids[1:25], taskr$row_ids[20:45]) # multiples and missings + test_valid_resampled_task(polrnr, taskr, "se") + polrnr$learner$predict_type = "se" + features_out = polrnr$train(list(taskr))[[1L]]$data(cols = c("regr.lm.response", "regr.lm.se")) + expect_true(all(which(rowSums(is.na(features_out)) == 2L) == 46:50)) +}) From a90616ca7157d5322b020bda61044e6c10d9ada2 Mon Sep 17 00:00:00 2001 From: sumny Date: Sun, 4 Oct 2020 20:24:31 +0200 Subject: [PATCH 3/8] allow for more flexible Resampling, fix tests and docs accordingly --- R/PipeOpLearnerCV.R | 111 ++++++++++--------------- man/mlr_pipeops_learner_cv.Rd | 42 ++++------ tests/testthat/test_pipeop_learnercv.R | 67 ++++++++------- 3 files changed, 93 insertions(+), 127 deletions(-) diff --git a/R/PipeOpLearnerCV.R b/R/PipeOpLearnerCV.R index 365bec0d2..66494fddf 100644 --- a/R/PipeOpLearnerCV.R +++ b/R/PipeOpLearnerCV.R @@ -5,7 +5,7 @@ #' @format [`R6Class`] object inheriting from [`PipeOpTaskPreproc`]/[`PipeOp`]. #' #' @description -#' Wraps an [`mlr3::Learner`] into a [`PipeOp`]. +#' Wraps a [`mlr3::Learner`] and [`mlr3::Resampling`] into a [`PipeOp`]. #' #' Returns resampled predictions during training as a [`Task`][mlr3::Task] and stores a model of the #' [`Learner`][mlr3::Learner] trained on the whole data in `$state`. This is used to create a similar @@ -16,7 +16,13 @@ #' for `$predict.type` `"prob"` the `.prob.` features are created, and for `$predict.type` `"se"` the new columns #' are `.response` and `.se`. `` denotes the `$id` of the [`PipeOpLearnerCV`] object. #' -#' Inherits the `$param_set` (and therefore `$param_set$values`) from the [`Learner`][mlr3::Learner] it is constructed from. +#' In the case of the resampling method returing multiple predictions per row id, the predictions are aggregated via their mean +#' (execpt for the `"response"` in the case of a [classification Task][mlr3::TaskClassif] which is aggregated using the mode). +#' In the case of the resampling method not returning predictions for all row ids as given in the input [`Task`][mlr3::Task], +#' these predictions are added as missing. 
+#' +#' Inherits both the `$param_set` (and therefore `$param_set$values`) from the [`Learner`][mlr3::Learner] and +#' [`Resampling`][mlr3::Resampling] it is constructed from. The parameter ids of the latter one are prefixed with `"resampling."`. #' #' [`PipeOpLearnerCV`] can be used to create "stacking" or "super learning" [`Graph`]s that use the output of one [`Learner`][mlr3::Learner] #' as features for another [`Learner`][mlr3::Learner]. Because the [`PipeOpLearnerCV`] erases the original input features, it is often @@ -24,12 +30,14 @@ #' #' @section Construction: #' ``` -#' PipeOpLearnerCV$new(learner, id = NULL, param_vals = list()) +#' PipeOpLearnerCV$new(learner, resampling = rsmp("cv", folds = 3), id = NULL, param_vals = list()) #' ``` #' #' * `learner` :: [`Learner`][mlr3::Learner] \cr #' [`Learner`][mlr3::Learner] to use for resampling / prediction. -#' * `id` :: `character(1)` +#' * `resampling` :: [`Resampling`][mlr3::Resampling] \cr +#' [`Resamling`][mlr3::Resampling] to use for resampling. Initialized to 3-fold cross-validation. +#' * `id` :: `character(1)`\cr #' Identifier of the resulting object, internally defaulting to the `id` of the [`Learner`][mlr3::Learner] being wrapped. #' * `param_vals` :: named `list`\cr #' List of hyperparameter settings, overwriting the hyperparameter settings that would otherwise be set during construction. Default `list()`. @@ -42,7 +50,7 @@ #' type given to `learner` during construction; both during training and prediction. #' #' The output is a task with the same target as the input task, with features replaced by predictions made by the [`Learner`][mlr3::Learner]. -#' During training, this prediction is the prediction made by [`resample`][mlr3::resample], during prediction, this is the +#' During training, this prediction is the out-of-sample prediction made by [`resample`][mlr3::resample], during prediction, this is the #' ordinary prediction made on the data by a [`Learner`][mlr3::Learner] trained on the training phase data. #' #' @section State: @@ -60,27 +68,9 @@ #' Prediction time, in seconds. #' #' @section Parameters: -#' The parameters are the parameters inherited from the [`PipeOpTaskPreproc`], as well as the parameters of the [`Learner`][mlr3::Learner] wrapped by this object. +#' The parameters are the parameters inherited from the [`PipeOpTaskPreproc`], as well as the parameters of the [`Learner`][mlr3::Learner] and +#' [`Resampling`][mlr3::Resampling] wrapped by this object. #' Besides that, parameters introduced are: -#' * `resampling.method` :: `character(1)`\cr -#' Which resampling method to use. Supports `"cv"`,`"bootstrap"`, `"holdout"`, `"loo"`, `"repeated_cv"`, `"subsampling"`, `"custom"` and `"insample"`. -#' See [`mlr_resamplings`][mlr3::mlr_resamplings]. -#' `"insample"` generates predictions with the model trained on all training data. -#' In the case of the resampling method returing multiple predictions per row id, the predictions are aggregated via their mean -#' (execpt for the `"response"` in the case of a [classification Task][mlr3::TaskClassif] which is aggregated using the mode). -#' In the case of the resampling method not returning predictions for all row ids as given in the input [`Task`][mlr3::Task], these predictions are added as missing. -#' * `resampling.repeats` :: `integer(1)`\cr -#' Number of repetitions. Initialized to 30. Only used for `resampling.method = "bootstrap"`, or `"repeated_cv"`, or `"subsampling"`. -#' * `resampling.folds` :: `integer(1)`\cr -#' Number of cross validation folds. 
Initialized to 3. Only used for `resampling.method = "cv"`, or `"repeated_cv"`. -#' * `resampling.ratio` :: `numeric(1)`\cr -#' Ratio of observations to put into the training set. Initialized to 2/3. Only used for `resampling.method = "bootstrap"`, or `"holdout"` or `"subsampling"`. -#' * `resampling.custom.train_sets` :: `list()`\cr -#' List with row ids for training, one list element per iteration. Must have the same length as `resampling.custom.test_sets`. -#' Only used for `resampling.method = "custom"`. -#' * `resampling.custom.test_sets` :: `list()`\cr -#' List with row ids for testing, one list element per iteration. Must have the same length as `resampling.custom.train_sets`. -#' Only used for `resampling.method = "custom"`. #' * `keep_response` :: `logical(1)`\cr #' Only effective during `"prob"` prediction: Whether to keep response values, if available. Initialized to `FALSE`. #' @@ -93,6 +83,8 @@ #' [`Learner`][mlr3::Learner] that is being wrapped. Read-only. #' * `learner_model` :: [`Learner`][mlr3::Learner]\cr #' [`Learner`][mlr3::Learner] that is being wrapped. This learner contains the model if the `PipeOp` is trained. Read-only. +#' * `resampling` :: [`Resampling`][mlr3::Resampling]\cr +#' [`Resampling`][mlr3::Resampling] that is being wrapped. Read-only. #' #' @section Methods: #' Methods inherited from [`PipeOpTaskPreproc`]/[`PipeOp`]. @@ -126,33 +118,24 @@ PipeOpLearnerCV = R6Class("PipeOpLearnerCV", inherit = PipeOpTaskPreproc, public = list( - initialize = function(learner, id = NULL, param_vals = list()) { + initialize = function(learner, resampling = rsmp("cv", folds = 3), id = NULL, param_vals = list()) { private$.learner = as_learner(learner, clone = TRUE) private$.learner$param_set$set_id = "" + private$.resampling = as_resampling(resampling, clone = TRUE) + private$.resampling$param_set$set_id = "resampling" + id = id %??% private$.learner$id - # FIXME: can be changed when mlr-org/mlr3#470 has an answer + # FIXME: probably should restrict to only classif and regr task_type = mlr_reflections$task_types[get("type") == private$.learner$task_type][order(get("package"))][1L]$task - private$.crossval_param_set = ParamSet$new(params = list( - ParamFct$new("method", levels = c("bootstrap", "custom", "cv", "holdout", "insample", "loo", "repeated_cv", "subsampling"), tags = c("train", "required")), - ParamInt$new("repeats", lower = 1L, tags = c("train", "required")), - ParamInt$new("folds", lower = 2L, upper = Inf, tags = c("train", "required")), - ParamDbl$new("ratio", lower = 0, upper = 1, tags = c("train", "required")), - ParamLgl$new("keep_response", tags = c("train", "required")), - ParamUty$new("custom.train_sets", tags = "train", custom_check = function(x) check_list(x, types = "atomicvector", any.missing = FALSE)), - ParamUty$new("custom.test_sets", tags = "train", custom_check = function(x) check_list(x, types = "atomicvector", any.missing = FALSE)) + private$.additional_param_set = ParamSet$new(params = list( + ParamLgl$new("keep_response", tags = c("train", "required")) )) - private$.crossval_param_set$values = list(method = "cv", repeats = 30L, folds = 3L, ratio = 2 / 3, keep_response = FALSE) - private$.crossval_param_set$set_id = "resampling" - # Dependencies in paradox have been broken from the start and this is known since at least a year: - # https://github.com/mlr-org/paradox/issues/216 - # The following would make it _impossible_ to set "method" to "insample", because then "folds" - # is both _required_ (required tag above) and at the same time must be 
unset (because of this - # dependency). We will opt for the least annoying behaviour here and just not use dependencies - # in PipeOp ParamSets. - # private$.crossval_param_set$add_dep("folds", "method", CondEqual$new("cv")) # don't do this. + private$.additional_param_set$values = list(keep_response = FALSE) + private$.additional_param_set$set_id = "" - super$initialize(id, alist(private$.crossval_param_set, private$.learner$param_set), param_vals = param_vals, can_subset_cols = TRUE, task_type = task_type, tags = c("learner", "ensemble")) + super$initialize(id, param_set = alist(private$.resampling$param_set, private$.additional_param_set, private$.learner$param_set), + param_vals = param_vals, can_subset_cols = TRUE, task_type = task_type, tags = c("learner", "ensemble")) } ), @@ -176,6 +159,14 @@ PipeOpLearnerCV = R6Class("PipeOpLearnerCV", } else { multiplicity_recurse(self$state, clone_with_state, learner = private$.learner) } + }, + resampling = function(val) { + if (!missing(val)) { + if (!identical(val, private$.resampling)) { + stop("$resampling is read-only.") + } + } + private$.resampling } ), private = list( @@ -184,28 +175,12 @@ PipeOpLearnerCV = R6Class("PipeOpLearnerCV", # Train a learner for predicting self$state = private$.learner$train(task)$state - pv = private$.crossval_param_set$values - - if (pv$method == "insample") { - return(private$pred_to_task(as.data.table(private$.learner$predict(task)), task)) # early exit - } # Compute resampled Predictions - rdesc = mlr_resamplings$get(pv$method) - rdesc$param_set$values = switch(pv$method, - "bootstrap" = list(repeats = pv$repeats, ratio = pv$ratio), - "custom" = list(), - "cv" = list(folds = pv$folds), - "holdout" = list(ratio = pv$ratio), - "loo" = list(), - "repeated_cv" = list(repeats = pv$repeats, folds = pv$folds), - "subsampling" = list(repeats = pv$repeats, ratio = pv$ratio)) - if (pv$method == "custom") { - rdesc$instantiate(task, train_sets = private$.crossval_param_set$values$custom.train_sets, test_sets = private$.crossval_param_set$values$custom.test_sets) - } - # FIXME: we may want to instantiate here in general for safety reasons - rr = resample(task, private$.learner, rdesc) + rr = resample(task, private$.learner, private$.resampling) prds = as.data.table(rr$prediction(predict_sets = "test")) + + # Some resamplings will result in rows being sampled multiple times and some being missing nrows_multiple = length(prds$row_id[duplicated(prds$row_id)]) missing_rows = setdiff(task$row_ids, prds$row_id) nrows_missing = length(missing_rows) @@ -214,7 +189,6 @@ PipeOpLearnerCV = R6Class("PipeOpLearnerCV", return(private$pred_to_task(prds, task)) # early exit } - # Some resamplings will result in rows being sampled multiple times and some being missing task_type = task$task_type prds_names = colnames(prds) @@ -274,7 +248,7 @@ PipeOpLearnerCV = R6Class("PipeOpLearnerCV", pred_to_task = function(prds, task) { if (!is.null(prds$truth)) prds[, truth := NULL] - if (!self$param_set$values$resampling.keep_response && self$learner$predict_type == "prob") { + if (!self$param_set$values$keep_response && self$learner$predict_type == "prob") { prds[, response := NULL] } renaming = setdiff(colnames(prds), "row_id") @@ -282,8 +256,9 @@ PipeOpLearnerCV = R6Class("PipeOpLearnerCV", setnames(prds, "row_id", task$backend$primary_key) task$select(character(0))$cbind(prds) }, - .crossval_param_set = NULL, - .learner = NULL + .additional_param_set = NULL, + .learner = NULL, + .resampling = NULL ) ) diff --git 
a/man/mlr_pipeops_learner_cv.Rd b/man/mlr_pipeops_learner_cv.Rd index db0aae79c..f88e4bab0 100644 --- a/man/mlr_pipeops_learner_cv.Rd +++ b/man/mlr_pipeops_learner_cv.Rd @@ -8,7 +8,7 @@ \code{\link{R6Class}} object inheriting from \code{\link{PipeOpTaskPreproc}}/\code{\link{PipeOp}}. } \description{ -Wraps an \code{\link[mlr3:Learner]{mlr3::Learner}} into a \code{\link{PipeOp}}. +Wraps a \code{\link[mlr3:Learner]{mlr3::Learner}} and \code{\link[mlr3:Resampling]{mlr3::Resampling}} into a \code{\link{PipeOp}}. Returns resampled predictions during training as a \code{\link[mlr3:Task]{Task}} and stores a model of the \code{\link[mlr3:Learner]{Learner}} trained on the whole data in \verb{$state}. This is used to create a similar @@ -19,19 +19,27 @@ The \code{\link[mlr3:Task]{Task}} gets features depending on the capsuled \code{ for \verb{$predict.type} \code{"prob"} the \verb{.prob.} features are created, and for \verb{$predict.type} \code{"se"} the new columns are \verb{.response} and \verb{.se}. \verb{} denotes the \verb{$id} of the \code{\link{PipeOpLearnerCV}} object. -Inherits the \verb{$param_set} (and therefore \verb{$param_set$values}) from the \code{\link[mlr3:Learner]{Learner}} it is constructed from. +In the case of the resampling method returing multiple predictions per row id, the predictions are aggregated via their mean +(execpt for the \code{"response"} in the case of a \link[mlr3:TaskClassif]{classification Task} which is aggregated using the mode). +In the case of the resampling method not returning predictions for all row ids as given in the input \code{\link[mlr3:Task]{Task}}, +these predictions are added as missing. + +Inherits both the \verb{$param_set} (and therefore \verb{$param_set$values}) from the \code{\link[mlr3:Learner]{Learner}} and +\code{\link[mlr3:Resampling]{Resampling}} it is constructed from. The parameter ids of the latter one are prefixed with \code{"resampling."}. \code{\link{PipeOpLearnerCV}} can be used to create "stacking" or "super learning" \code{\link{Graph}}s that use the output of one \code{\link[mlr3:Learner]{Learner}} as features for another \code{\link[mlr3:Learner]{Learner}}. Because the \code{\link{PipeOpLearnerCV}} erases the original input features, it is often useful to use \code{\link{PipeOpFeatureUnion}} to bind the prediction \code{\link[mlr3:Task]{Task}} to the original input \code{\link[mlr3:Task]{Task}}. } \section{Construction}{ -\preformatted{PipeOpLearnerCV$new(learner, id = NULL, param_vals = list()) +\preformatted{PipeOpLearnerCV$new(learner, resampling = rsmp("cv", folds = 3), id = NULL, param_vals = list()) } \itemize{ \item \code{learner} :: \code{\link[mlr3:Learner]{Learner}} \cr \code{\link[mlr3:Learner]{Learner}} to use for resampling / prediction. -\item \code{id} :: \code{character(1)} +\item \code{resampling} :: \code{\link[mlr3:Resampling]{Resampling}} \cr +\code{\link[mlr3:Resampling]{Resamling}} to use for resampling. Initialized to 3-fold cross-validation. +\item \code{id} :: \code{character(1)}\cr Identifier of the resulting object, internally defaulting to the \code{id} of the \code{\link[mlr3:Learner]{Learner}} being wrapped. \item \code{param_vals} :: named \code{list}\cr List of hyperparameter settings, overwriting the hyperparameter settings that would otherwise be set during construction. Default \code{list()}. @@ -47,7 +55,7 @@ type given to \code{learner} during construction; both during training and predi type given to \code{learner} during construction; both during training and prediction. 
The output is a task with the same target as the input task, with features replaced by predictions made by the \code{\link[mlr3:Learner]{Learner}}. -During training, this prediction is the prediction made by \code{\link[mlr3:resample]{resample}}, during prediction, this is the +During training, this prediction is the out-of-sample prediction made by \code{\link[mlr3:resample]{resample}}, during prediction, this is the ordinary prediction made on the data by a \code{\link[mlr3:Learner]{Learner}} trained on the training phase data. } @@ -71,28 +79,10 @@ Prediction time, in seconds. \section{Parameters}{ -The parameters are the parameters inherited from the \code{\link{PipeOpTaskPreproc}}, as well as the parameters of the \code{\link[mlr3:Learner]{Learner}} wrapped by this object. +The parameters are the parameters inherited from the \code{\link{PipeOpTaskPreproc}}, as well as the parameters of the \code{\link[mlr3:Learner]{Learner}} and +\code{\link[mlr3:Resampling]{Resampling}} wrapped by this object. Besides that, parameters introduced are: \itemize{ -\item \code{resampling.method} :: \code{character(1)}\cr -Which resampling method to use. Supports \code{"cv"},\code{"bootstrap"}, \code{"holdout"}, \code{"loo"}, \code{"repeated_cv"}, \code{"subsampling"}, \code{"custom"} and \code{"insample"}. -See \code{\link[mlr3:mlr_resamplings]{mlr_resamplings}}. -\code{"insample"} generates predictions with the model trained on all training data. -In the case of the resampling method returing multiple predictions per row id, the predictions are aggregated via their mean -(execpt for the \code{"response"} in the case of a \link[mlr3:TaskClassif]{classification Task} which is aggregated using the mode). -In the case of the resampling method not returning predictions for all row ids as given in the input \code{\link[mlr3:Task]{Task}}, these predictions are added as missing. -\item \code{resampling.repeats} :: \code{integer(1)}\cr -Number of repetitions. Initialized to 30. Only used for \code{resampling.method = "bootstrap"}, or \code{"repeated_cv"}, or \code{"subsampling"}. -\item \code{resampling.folds} :: \code{integer(1)}\cr -Number of cross validation folds. Initialized to 3. Only used for \code{resampling.method = "cv"}, or \code{"repeated_cv"}. -\item \code{resampling.ratio} :: \code{numeric(1)}\cr -Ratio of observations to put into the training set. Initialized to 2/3. Only used for \code{resampling.method = "bootstrap"}, or \code{"holdout"} or \code{"subsampling"}. -\item \code{resampling.custom.train_sets} :: \code{list()}\cr -List with row ids for training, one list element per iteration. Must have the same length as \code{resampling.custom.test_sets}. -Only used for \code{resampling.method = "custom"}. -\item \code{resampling.custom.test_sets} :: \code{list()}\cr -List with row ids for testing, one list element per iteration. Must have the same length as \code{resampling.custom.train_sets}. -Only used for \code{resampling.method = "custom"}. \item \code{keep_response} :: \code{logical(1)}\cr Only effective during \code{"prob"} prediction: Whether to keep response values, if available. Initialized to \code{FALSE}. } @@ -111,6 +101,8 @@ Fields inherited from \code{\link{PipeOp}}, as well as: \code{\link[mlr3:Learner]{Learner}} that is being wrapped. Read-only. \item \code{learner_model} :: \code{\link[mlr3:Learner]{Learner}}\cr \code{\link[mlr3:Learner]{Learner}} that is being wrapped. This learner contains the model if the \code{PipeOp} is trained. Read-only. 
+\item \code{resampling} :: \code{\link[mlr3:Resampling]{Resampling}}\cr +\code{\link[mlr3:Resampling]{Resampling}} that is being wrapped. Read-only. } } diff --git a/tests/testthat/test_pipeop_learnercv.R b/tests/testthat/test_pipeop_learnercv.R index 038c3fc14..b64daa81b 100644 --- a/tests/testthat/test_pipeop_learnercv.R +++ b/tests/testthat/test_pipeop_learnercv.R @@ -32,7 +32,7 @@ test_that("PipeOpLearnerCV - basic properties", { list(lrn), iris_with_unambiguous_mode, predict_like_train = FALSE, deterministic_train = FALSE) # 'insample' PipeOpLearnerCV with deterministic Learner is deterministic in every regard! expect_datapreproc_pipeop_class(PipeOpLearnerCV, - list(lrn, param_vals = list(resampling.method = "insample")), iris_with_unambiguous_mode) + list(lrn, resampling = rsmp("insample")), iris_with_unambiguous_mode) expect_error(PipeOpLearnerCV$new()) @@ -41,12 +41,12 @@ test_that("PipeOpLearnerCV - basic properties", { test_that("PipeOpLearnerCV - param values", { lrn = mlr_learners$get("classif.rpart") polrn = PipeOpLearnerCV$new(lrn) - expect_subset(c("minsplit", "resampling.method", "resampling.folds"), names(polrn$param_set$params)) - expect_equal(polrn$param_set$values, list(resampling.method = "cv", resampling.repeats = 30, resampling.folds = 3, resampling.ratio = 2/3, resampling.keep_response = FALSE, xval = 0)) + expect_subset(c("minsplit", "resampling.folds", "keep_response"), names(polrn$param_set$params)) + expect_equal(polrn$param_set$values, list(resampling.folds = 3, keep_response = FALSE, xval = 0)) polrn$param_set$values$minsplit = 2 - expect_equal(polrn$param_set$values, list(resampling.method = "cv", resampling.repeats = 30, resampling.folds = 3, resampling.ratio = 2/3, resampling.keep_response = FALSE, minsplit = 2, xval = 0)) + expect_equal(polrn$param_set$values, list(resampling.folds = 3, keep_response = FALSE, minsplit = 2, xval = 0)) polrn$param_set$values$resampling.folds = 4 - expect_equal(polrn$param_set$values, list(resampling.method = "cv", resampling.repeats = 30, resampling.folds = 4, resampling.ratio = 2/3, resampling.keep_response = FALSE, minsplit = 2, xval = 0)) + expect_equal(polrn$param_set$values, list(resampling.folds = 4, keep_response = FALSE, minsplit = 2, xval = 0)) }) test_that("PipeOpLearnerCV - within resampling", { @@ -59,13 +59,13 @@ test_that("PipeOpLearnerCV - insample resampling", { lrn = mlr_learners$get("classif.featureless") iris_with_unambiguous_mode = mlr_tasks$get("iris")$filter(c(1:49, 52:150)) # want featureless learner without randomness - polrn = PipeOpLearnerCV$new(lrn, param_vals = list(resampling.method = "insample")) + polrn = PipeOpLearnerCV$new(lrn, rsmp("insample")) expect_equal(polrn$train(list(iris_with_unambiguous_mode))[[1]]$data(), cbind(iris_with_unambiguous_mode$data(cols = "Species"), classif.featureless.response = factor("virginica", levels = levels(iris[[5]])))) lrn = mlr_learners$get("classif.rpart") - polrn = PipeOpLearnerCV$new(lrn, param_vals = list(resampling.method = "insample")) + polrn = PipeOpLearnerCV$new(lrn, rsmp("insample")) expect_equal(polrn$train(list(iris_with_unambiguous_mode))[[1]], polrn$predict(list(iris_with_unambiguous_mode))[[1]]) }) @@ -101,9 +101,9 @@ test_that("PipeOpLearnerCV - model active binding to state", { test_that("PipeOpLearnerCV - different methods", { skip_on_cran() # takes too long + # Helper test_valid_resampled_task = function(polrn, task, predict_type) { - polrn$param_set$values$resampling.keep_response = FALSE polrn$learner$predict_type = predict_type 
train_out = polrn$train(list(task))[[1]] @@ -134,9 +134,6 @@ test_that("PipeOpLearnerCV - different methods", { } } - polrnc = PipeOpLearnerCV$new(LearnerClassifRpart$new(), param_vals = list(resampling.method = "cv", resampling.folds = 2, resampling.repeats = 2)) - polrnr = PipeOpLearnerCV$new(mlr3learners::LearnerRegrLM$new(), param_vals = list(resampling.method = "cv", resampling.folds = 2, resampling.repeats = 2)) - set.seed(1234) # faster training taskc = tsk("german_credit")$filter(sample(1000, 50)) @@ -144,59 +141,61 @@ test_that("PipeOpLearnerCV - different methods", { taskr = tsk("boston_housing")$filter(sample(sample(506, 50))) taskr$select("rad") - # cv (see params above) + # cv + polrnc = PipeOpLearnerCV$new(LearnerClassifRpart$new(), rsmp("cv", folds = 2)) + polrnr = PipeOpLearnerCV$new(mlr3learners::LearnerRegrLM$new(), rsmp("cv", folds = 2)) test_valid_resampled_task(polrnc, taskc, "response") test_valid_resampled_task(polrnc, taskc, "prob") test_valid_resampled_task(polrnr, taskr, "se") # bootstrap - polrnc$param_set$values$resampling.method = "bootstrap" - polrnr$param_set$values$resampling.method = "bootstrap" + polrnc = PipeOpLearnerCV$new(LearnerClassifRpart$new(), rsmp("bootstrap", repeats = 2)) + polrnr = PipeOpLearnerCV$new(mlr3learners::LearnerRegrLM$new(), rsmp("bootstrap", repeats = 2)) test_valid_resampled_task(polrnc, taskc, "response") test_valid_resampled_task(polrnc, taskc, "prob") test_valid_resampled_task(polrnr, taskr, "se") # holdout - polrnc$param_set$values$resampling.method = "holdout" - polrnr$param_set$values$resampling.method = "holdout" + polrnc = PipeOpLearnerCV$new(LearnerClassifRpart$new(), rsmp("holdout")) + polrnr = PipeOpLearnerCV$new(mlr3learners::LearnerRegrLM$new(), rsmp("holdout")) test_valid_resampled_task(polrnc, taskc, "response") test_valid_resampled_task(polrnc, taskc, "prob") test_valid_resampled_task(polrnr, taskr, "se") # loo - polrnc$param_set$values$resampling.method = "loo" - polrnr$param_set$values$resampling.method = "loo" + polrnc = PipeOpLearnerCV$new(LearnerClassifRpart$new(), rsmp("loo")) + polrnr = PipeOpLearnerCV$new(mlr3learners::LearnerRegrLM$new(), rsmp("loo")) test_valid_resampled_task(polrnc, taskc, "response") test_valid_resampled_task(polrnc, taskc, "prob") test_valid_resampled_task(polrnr, taskr, "se") # repeated_cv - polrnc$param_set$values$resampling.method = "repeated_cv" - polrnr$param_set$values$resampling.method = "repeated_cv" + polrnc = PipeOpLearnerCV$new(LearnerClassifRpart$new(), rsmp("repeated_cv", folds = 2, repeats = 2)) + polrnr = PipeOpLearnerCV$new(mlr3learners::LearnerRegrLM$new(), rsmp("repeated_cv", folds = 2, repeats = 2)) test_valid_resampled_task(polrnc, taskc, "response") test_valid_resampled_task(polrnc, taskc, "prob") test_valid_resampled_task(polrnr, taskr, "se") # subsampling - polrnc$param_set$values$resampling.method = "subsampling" - polrnr$param_set$values$resampling.method = "subsampling" + polrnc = PipeOpLearnerCV$new(LearnerClassifRpart$new(), rsmp("subsampling", repeats = 2)) + polrnr = PipeOpLearnerCV$new(mlr3learners::LearnerRegrLM$new(), rsmp("subsampling", repeats = 2)) test_valid_resampled_task(polrnc, taskc, "response") test_valid_resampled_task(polrnc, taskc, "prob") test_valid_resampled_task(polrnr, taskr, "se") # custom # classif - polrnc$param_set$values$resampling.method = "custom" - polrnc$param_set$values$resampling.custom.train_sets = list(taskc$row_ids[1:25], taskc$row_ids[26:50]) - polrnc$param_set$values$resampling.custom.test_sets = list(taskc$row_ids[1:25], 
taskc$row_ids[26:50]) # no multiples no missings + rcm = rsmp("custom") + rcm$instantiate(taskc, train_sets = list(taskc$row_ids[1:25], taskc$row_ids[26:50]), test_sets = list(taskc$row_ids[1:25], taskc$row_ids[26:50])) # no multiples no missings + polrnc = PipeOpLearnerCV$new(LearnerClassifRpart$new(), rcm) test_valid_resampled_task(polrnc, taskc, "response") test_valid_resampled_task(polrnc, taskc, "prob") - polrnc$param_set$values$resampling.custom.test_sets = list(taskc$row_ids[1:25], taskc$row_ids[1:50]) # multiples but no missings + rcm$instantiate(taskc, train_sets = list(taskc$row_ids[1:25], taskc$row_ids[26:50]), test_sets = list(taskc$row_ids[1:25], taskc$row_ids[1:50])) # multiples but no missings test_valid_resampled_task(polrnc, taskc, "response") test_valid_resampled_task(polrnc, taskc, "prob") - polrnc$param_set$values$resampling.custom.test_sets = list(taskc$row_ids[1:25], taskc$row_ids[26:45]) # no multiples but missings + rcm$instantiate(taskc, train_sets = list(taskc$row_ids[1:25], taskc$row_ids[26:50]), test_sets = list(taskc$row_ids[1:25], taskc$row_ids[26:45])) # no multiples but missings test_valid_resampled_task(polrnc, taskc, "response") test_valid_resampled_task(polrnc, taskc, "prob") polrnc$learner$predict_type = "response" @@ -206,7 +205,7 @@ test_that("PipeOpLearnerCV - different methods", { features_out = polrnc$train(list(taskc))[[1L]]$data(cols = c("classif.rpart.prob.good", "classif.rpart.prob.bad")) expect_true(all(which(rowSums(is.na(features_out)) == 2L) == 46:50)) - polrnc$param_set$values$resampling.custom.test_sets = list(taskc$row_ids[1:25], taskc$row_ids[20:45]) # multiples and missings + rcm$instantiate(taskc, train_sets = list(taskc$row_ids[1:25], taskc$row_ids[26:50]), test_sets = list(taskc$row_ids[1:25], taskc$row_ids[20:45])) # multiples and missings test_valid_resampled_task(polrnc, taskc, "response") test_valid_resampled_task(polrnc, taskc, "prob") polrnc$learner$predict_type = "response" @@ -217,21 +216,21 @@ test_that("PipeOpLearnerCV - different methods", { expect_true(all(which(rowSums(is.na(features_out)) == 2L) == 46:50)) # regr - polrnr$param_set$values$resampling.method = "custom" - polrnr$param_set$values$resampling.custom.train_sets = list(taskr$row_ids[1:25], taskr$row_ids[26:50]) - polrnr$param_set$values$resampling.custom.test_sets = list(taskr$row_ids[1:25], taskr$row_ids[26:50]) # no multiples no missings + rcm = rsmp("custom") + rcm$instantiate(taskr, train_sets = list(taskr$row_ids[1:25], taskr$row_ids[26:50]), test_sets = list(taskr$row_ids[1:25], taskr$row_ids[26:50])) # no multiples no missings + polrnr = PipeOpLearnerCV$new(mlr3learners::LearnerRegrLM$new(), rcm) test_valid_resampled_task(polrnr, taskr, "se") - polrnr$param_set$values$resampling.custom.test_sets = list(taskr$row_ids[1:25], taskr$row_ids[1:50]) # multiples but no missings + rcm$instantiate(taskr, train_sets = list(taskr$row_ids[1:25], taskr$row_ids[26:50]), test_sets = list(taskr$row_ids[1:25], taskr$row_ids[1:50])) # multiples but no missings test_valid_resampled_task(polrnr, taskr, "se") - polrnr$param_set$values$resampling.custom.test_sets = list(taskr$row_ids[1:25], taskr$row_ids[26:45]) # no multiples but missings + rcm$instantiate(taskr, train_sets = list(taskr$row_ids[1:25], taskr$row_ids[26:50]), test_sets = list(taskr$row_ids[1:25], taskr$row_ids[26:45])) # no multiples but missings test_valid_resampled_task(polrnr, taskr, "se") polrnr$learner$predict_type = "se" features_out = polrnr$train(list(taskr))[[1L]]$data(cols = c("regr.lm.response", 
"regr.lm.se")) expect_true(all(which(rowSums(is.na(features_out)) == 2L) == 46:50)) - polrnr$param_set$values$resampling.custom.test_sets = list(taskr$row_ids[1:25], taskr$row_ids[20:45]) # multiples and missings + rcm$instantiate(taskr, train_sets = list(taskr$row_ids[1:25], taskr$row_ids[26:50]), test_sets = list(taskr$row_ids[1:25], taskr$row_ids[20:45])) # multiples and missings test_valid_resampled_task(polrnr, taskr, "se") polrnr$learner$predict_type = "se" features_out = polrnr$train(list(taskr))[[1L]]$data(cols = c("regr.lm.response", "regr.lm.se")) From 7c3c301ac56e1f3aa4886f1bfcb1a9733c1617d4 Mon Sep 17 00:00:00 2001 From: sumny Date: Sun, 4 Oct 2020 20:32:07 +0200 Subject: [PATCH 4/8] update NEWS --- NEWS.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/NEWS.md b/NEWS.md index 2b03fdcb9..9abdc6366 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,4 +1,6 @@ # mlr3pipelines 0.3.0-9000 +* Changed PipeOps: + - PipeOpLearnerCV now also wraps a Resampling allowing for a wider use of resampling methods # mlr3pipelines 0.3.0 From d7f89697f05bcb442f875d4f6db3ceebde6864fa Mon Sep 17 00:00:00 2001 From: sumny Date: Thu, 15 Oct 2020 17:03:28 +0200 Subject: [PATCH 5/8] fix conversion learnercv test, rerun docs --- man/mlr_pipeops_nmf.Rd | 4 ++-- tests/testthat/test_conversion.R | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/man/mlr_pipeops_nmf.Rd b/man/mlr_pipeops_nmf.Rd index 3df9f0433..23a039b44 100644 --- a/man/mlr_pipeops_nmf.Rd +++ b/man/mlr_pipeops_nmf.Rd @@ -94,8 +94,8 @@ See \code{\link[NMF:nmf]{nmf()}}. \section{Internals}{ -Uses the \code{\link[NMF:nmf]{nmf}} function as well as \code{\link[NMF:basis-coef-methods]{basis}}, \code{\link[NMF:basis-coef-methods]{coef}} and -\code{\link[MASS:ginv]{ginv}}. +Uses the \code{\link[NMF:nmf]{nmf()}} function as well as \code{\link[NMF:basis-coef-methods]{basis()}}, \code{\link[NMF:basis-coef-methods]{coef()}} and +\code{\link[MASS:ginv]{ginv()}}. 
} \section{Methods}{ diff --git a/tests/testthat/test_conversion.R b/tests/testthat/test_conversion.R index 68526a694..d7ce9480d 100644 --- a/tests/testthat/test_conversion.R +++ b/tests/testthat/test_conversion.R @@ -155,7 +155,7 @@ test_that("PipeOp to GraphLearner", { expect_equal(r1, r3) - po_cv = po("learner_cv", learner = po, param_vals = list(resampling.method = "insample")) + po_cv = po("learner_cv", learner = po, resampling = rsmp("insample")) expect_true("GraphLearner" %in% class(po_cv$learner)) train_out = po_cv$train(list(task)) From c7b8a3c895ec2bcc0cc829d32d313a484f398556 Mon Sep 17 00:00:00 2001 From: sumny Date: Tue, 20 Oct 2020 17:31:57 +0200 Subject: [PATCH 6/8] fix news, fix tags of resampling params, fix tests --- NEWS.md | 1 - R/PipeOpLearnerCV.R | 21 ++++++++++++++------- man/mlr_pipeops_learner_cv.Rd | 9 +++++---- tests/testthat/test_pipeop_learnercv.R | 2 +- tests/testthat/test_usecases.R | 2 +- 5 files changed, 21 insertions(+), 14 deletions(-) diff --git a/NEWS.md b/NEWS.md index 1ec4012d7..77180c22b 100644 --- a/NEWS.md +++ b/NEWS.md @@ -2,7 +2,6 @@ * Changed PipeOps: - PipeOpLearnerCV now also wraps a Resampling allowing for a wider use of resampling methods - - PipeOpNMF: now exposes all parameters previously in .options - PipeOpMissInd now also allows for setting type = integer - PipeOpNMF: now exposes all parameters previously in .options * Changed mlr_graphs: diff --git a/R/PipeOpLearnerCV.R b/R/PipeOpLearnerCV.R index 66494fddf..d5ef530d7 100644 --- a/R/PipeOpLearnerCV.R +++ b/R/PipeOpLearnerCV.R @@ -16,13 +16,14 @@ #' for `$predict.type` `"prob"` the `.prob.` features are created, and for `$predict.type` `"se"` the new columns #' are `.response` and `.se`. `` denotes the `$id` of the [`PipeOpLearnerCV`] object. #' -#' In the case of the resampling method returing multiple predictions per row id, the predictions are aggregated via their mean -#' (execpt for the `"response"` in the case of a [classification Task][mlr3::TaskClassif] which is aggregated using the mode). +#' In the case of the resampling method returning multiple predictions per row id, the predictions are aggregated via their mean +#' (except for the `"response"` in the case of a [classification Task][mlr3::TaskClassif] which is aggregated using the mode). #' In the case of the resampling method not returning predictions for all row ids as given in the input [`Task`][mlr3::Task], #' these predictions are added as missing. #' #' Inherits both the `$param_set` (and therefore `$param_set$values`) from the [`Learner`][mlr3::Learner] and -#' [`Resampling`][mlr3::Resampling] it is constructed from. The parameter ids of the latter one are prefixed with `"resampling."`. +#' [`Resampling`][mlr3::Resampling] it is constructed from. The parameter ids of the latter one are prefixed with `"resampling."` +#' and the tags of these parameters are extended by `"train"`. #' #' [`PipeOpLearnerCV`] can be used to create "stacking" or "super learning" [`Graph`]s that use the output of one [`Learner`][mlr3::Learner] #' as features for another [`Learner`][mlr3::Learner]. Because the [`PipeOpLearnerCV`] erases the original input features, it is often @@ -36,7 +37,7 @@ #' * `learner` :: [`Learner`][mlr3::Learner] \cr #' [`Learner`][mlr3::Learner] to use for resampling / prediction. #' * `resampling` :: [`Resampling`][mlr3::Resampling] \cr -#' [`Resamling`][mlr3::Resampling] to use for resampling. Initialized to 3-fold cross-validation. +#' [`Resampling`][mlr3::Resampling] to use for resampling. 
Initialized to 3-fold cross-validation. #' * `id` :: `character(1)`\cr #' Identifier of the resulting object, internally defaulting to the `id` of the [`Learner`][mlr3::Learner] being wrapped. #' * `param_vals` :: named `list`\cr @@ -124,8 +125,14 @@ PipeOpLearnerCV = R6Class("PipeOpLearnerCV", private$.resampling = as_resampling(resampling, clone = TRUE) private$.resampling$param_set$set_id = "resampling" + # tags of resampling parameters should include "train"; we fix this here + for (i in seq_along(private$.resampling$param_set$params)) { + private$.resampling$param_set$params[[i]]$tags = c("train", private$.resampling$param_set$params[[i]]$tags) + } + + id = id %??% private$.learner$id - # FIXME: probably should restrict to only classif and regr + # FIXME: probably should restrict to only classif and regr because of the potential aggregation being done below task_type = mlr_reflections$task_types[get("type") == private$.learner$task_type][order(get("package"))][1L]$task private$.additional_param_set = ParamSet$new(params = list( @@ -234,7 +241,7 @@ PipeOpLearnerCV = R6Class("PipeOpLearnerCV", prds_corrected$response = factor(prds_corrected$response, levels = levels(target), ordered = is.ordered(target)) } - # FIXME: safety cheks? + # FIXME: do we need additional safety checks here? private$pred_to_task(prds_corrected, task) }, @@ -262,7 +269,7 @@ PipeOpLearnerCV = R6Class("PipeOpLearnerCV", ) ) -# Helper function to add missings to predictions based on their storage mode +# helper function to add missings to predictions based on their storage mode add_missings = function(x, len) { c(x, switch(typeof(x), "character" = rep_len(NA_character_, length.out = len), diff --git a/man/mlr_pipeops_learner_cv.Rd b/man/mlr_pipeops_learner_cv.Rd index f88e4bab0..c46448579 100644 --- a/man/mlr_pipeops_learner_cv.Rd +++ b/man/mlr_pipeops_learner_cv.Rd @@ -19,13 +19,14 @@ The \code{\link[mlr3:Task]{Task}} gets features depending on the capsuled \code{ for \verb{$predict.type} \code{"prob"} the \verb{.prob.} features are created, and for \verb{$predict.type} \code{"se"} the new columns are \verb{.response} and \verb{.se}. \verb{} denotes the \verb{$id} of the \code{\link{PipeOpLearnerCV}} object. -In the case of the resampling method returing multiple predictions per row id, the predictions are aggregated via their mean -(execpt for the \code{"response"} in the case of a \link[mlr3:TaskClassif]{classification Task} which is aggregated using the mode). +In the case of the resampling method returning multiple predictions per row id, the predictions are aggregated via their mean +(except for the \code{"response"} in the case of a \link[mlr3:TaskClassif]{classification Task} which is aggregated using the mode). In the case of the resampling method not returning predictions for all row ids as given in the input \code{\link[mlr3:Task]{Task}}, these predictions are added as missing. Inherits both the \verb{$param_set} (and therefore \verb{$param_set$values}) from the \code{\link[mlr3:Learner]{Learner}} and -\code{\link[mlr3:Resampling]{Resampling}} it is constructed from. The parameter ids of the latter one are prefixed with \code{"resampling."}. +\code{\link[mlr3:Resampling]{Resampling}} it is constructed from. The parameter ids of the latter one are prefixed with \code{"resampling."} +and the tags of these parameters are extended by \code{"train"}. 
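A small sketch of the construction and parameter prefixing described above, consistent with the examples and tests elsewhere in this patch: the Resampling is passed at construction and its parameters appear in the PipeOp's param_set with the "resampling." prefix, tagged "train".

library("mlr3")
library("mlr3pipelines")
po_cv = po("learner_cv", lrn("classif.rpart"), rsmp("cv", folds = 5))
po_cv$param_set$values$resampling.folds      # 5, exposed via the "resampling." prefix
po_cv$param_set$values$resampling.folds = 3  # can be changed like any other hyperparameter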
\code{\link{PipeOpLearnerCV}} can be used to create "stacking" or "super learning" \code{\link{Graph}}s that use the output of one \code{\link[mlr3:Learner]{Learner}} as features for another \code{\link[mlr3:Learner]{Learner}}. Because the \code{\link{PipeOpLearnerCV}} erases the original input features, it is often @@ -38,7 +39,7 @@ useful to use \code{\link{PipeOpFeatureUnion}} to bind the prediction \code{\lin \item \code{learner} :: \code{\link[mlr3:Learner]{Learner}} \cr \code{\link[mlr3:Learner]{Learner}} to use for resampling / prediction. \item \code{resampling} :: \code{\link[mlr3:Resampling]{Resampling}} \cr -\code{\link[mlr3:Resampling]{Resamling}} to use for resampling. Initialized to 3-fold cross-validation. +\code{\link[mlr3:Resampling]{Resampling}} to use for resampling. Initialized to 3-fold cross-validation. \item \code{id} :: \code{character(1)}\cr Identifier of the resulting object, internally defaulting to the \code{id} of the \code{\link[mlr3:Learner]{Learner}} being wrapped. \item \code{param_vals} :: named \code{list}\cr diff --git a/tests/testthat/test_pipeop_learnercv.R b/tests/testthat/test_pipeop_learnercv.R index 80d6bb047..67510d8ee 100644 --- a/tests/testthat/test_pipeop_learnercv.R +++ b/tests/testthat/test_pipeop_learnercv.R @@ -32,7 +32,7 @@ test_that("PipeOpLearnerCV - basic properties", { list(lrn), iris_with_unambiguous_mode, predict_like_train = FALSE, deterministic_train = FALSE, check_ps_default_values = FALSE) # 'insample' PipeOpLearnerCV with deterministic Learner is deterministic in every regard! expect_datapreproc_pipeop_class(PipeOpLearnerCV, - list(lrn, param_vals = list(resampling.method = "insample")), iris_with_unambiguous_mode, check_ps_default_values = FALSE) + list(lrn, resampling = rsmp("insample")), iris_with_unambiguous_mode, check_ps_default_values = FALSE) expect_error(PipeOpLearnerCV$new()) diff --git a/tests/testthat/test_usecases.R b/tests/testthat/test_usecases.R index 40117175b..baf2d0dfb 100644 --- a/tests/testthat/test_usecases.R +++ b/tests/testthat/test_usecases.R @@ -152,7 +152,7 @@ test_that("stacking", { pipe$pipeops$classif.rpart$learner$predict_type = "prob" pipe$pipeops$classif.featureless$learner$predict_type = "prob" - pipe$pipeops$classif.featureless$param_set$values$resampling.keep_response = TRUE + pipe$pipeops$classif.featureless$param_set$values$keep_response = TRUE result = pipe$train(task)[[1]] From 2f99db84e61214bd83507715d6ce4c4095f5150a Mon Sep 17 00:00:00 2001 From: sumny Date: Thu, 11 Mar 2021 12:32:13 +0100 Subject: [PATCH 7/8] rework --- DESCRIPTION | 3 +- NAMESPACE | 1 + NEWS.md | 5 +- R/PipeOpAggregate.R | 148 +++++++++++++++++++ R/PipeOpLearnerCV.R | 127 +++++----------- R/PipeOpTuneThreshold.R | 7 +- R/zzz.R | 3 + man/PipeOp.Rd | 1 + man/PipeOpEnsemble.Rd | 1 + man/PipeOpImpute.Rd | 1 + man/PipeOpTargetTrafo.Rd | 1 + man/PipeOpTaskPreproc.Rd | 1 + man/PipeOpTaskPreprocSimple.Rd | 1 + man/mlr_pipeops.Rd | 1 + man/mlr_pipeops_aggregate.Rd | 178 +++++++++++++++++++++++ man/mlr_pipeops_boxcox.Rd | 1 + man/mlr_pipeops_branch.Rd | 1 + man/mlr_pipeops_chunk.Rd | 1 + man/mlr_pipeops_classbalancing.Rd | 1 + man/mlr_pipeops_classifavg.Rd | 1 + man/mlr_pipeops_classweights.Rd | 1 + man/mlr_pipeops_colapply.Rd | 1 + man/mlr_pipeops_collapsefactors.Rd | 1 + man/mlr_pipeops_colroles.Rd | 1 + man/mlr_pipeops_copy.Rd | 1 + man/mlr_pipeops_datefeatures.Rd | 1 + man/mlr_pipeops_encode.Rd | 1 + man/mlr_pipeops_encodeimpact.Rd | 1 + man/mlr_pipeops_encodelmer.Rd | 1 + man/mlr_pipeops_featureunion.Rd | 1 + 
man/mlr_pipeops_filter.Rd | 1 + man/mlr_pipeops_fixfactors.Rd | 1 + man/mlr_pipeops_histbin.Rd | 1 + man/mlr_pipeops_ica.Rd | 1 + man/mlr_pipeops_imputeconstant.Rd | 1 + man/mlr_pipeops_imputehist.Rd | 1 + man/mlr_pipeops_imputelearner.Rd | 1 + man/mlr_pipeops_imputemean.Rd | 1 + man/mlr_pipeops_imputemedian.Rd | 1 + man/mlr_pipeops_imputemode.Rd | 1 + man/mlr_pipeops_imputeoor.Rd | 1 + man/mlr_pipeops_imputesample.Rd | 1 + man/mlr_pipeops_kernelpca.Rd | 1 + man/mlr_pipeops_learner.Rd | 1 + man/mlr_pipeops_learner_cv.Rd | 14 +- man/mlr_pipeops_missind.Rd | 1 + man/mlr_pipeops_modelmatrix.Rd | 1 + man/mlr_pipeops_multiplicityexply.Rd | 1 + man/mlr_pipeops_multiplicityimply.Rd | 1 + man/mlr_pipeops_mutate.Rd | 1 + man/mlr_pipeops_nmf.Rd | 1 + man/mlr_pipeops_nop.Rd | 1 + man/mlr_pipeops_ovrsplit.Rd | 1 + man/mlr_pipeops_ovrunite.Rd | 1 + man/mlr_pipeops_pca.Rd | 1 + man/mlr_pipeops_proxy.Rd | 1 + man/mlr_pipeops_quantilebin.Rd | 1 + man/mlr_pipeops_randomprojection.Rd | 1 + man/mlr_pipeops_randomresponse.Rd | 1 + man/mlr_pipeops_regravg.Rd | 1 + man/mlr_pipeops_removeconstants.Rd | 1 + man/mlr_pipeops_renamecolumns.Rd | 1 + man/mlr_pipeops_replicate.Rd | 1 + man/mlr_pipeops_scale.Rd | 1 + man/mlr_pipeops_scalemaxabs.Rd | 1 + man/mlr_pipeops_scalerange.Rd | 1 + man/mlr_pipeops_select.Rd | 1 + man/mlr_pipeops_smote.Rd | 1 + man/mlr_pipeops_spatialsign.Rd | 1 + man/mlr_pipeops_subsample.Rd | 1 + man/mlr_pipeops_targetinvert.Rd | 1 + man/mlr_pipeops_targetmutate.Rd | 1 + man/mlr_pipeops_targettrafoscalerange.Rd | 1 + man/mlr_pipeops_textvectorizer.Rd | 1 + man/mlr_pipeops_threshold.Rd | 1 + man/mlr_pipeops_tunethreshold.Rd | 1 + man/mlr_pipeops_unbranch.Rd | 1 + man/mlr_pipeops_updatetarget.Rd | 1 + man/mlr_pipeops_vtreat.Rd | 1 + man/mlr_pipeops_yeojohnson.Rd | 1 + tests/testthat/test_pipeop_aggregate.R | 159 ++++++++++++++++++++ tests/testthat/test_pipeop_colroles.R | 2 +- tests/testthat/test_pipeop_learnercv.R | 170 ++-------------------- 83 files changed, 635 insertions(+), 253 deletions(-) create mode 100644 R/PipeOpAggregate.R create mode 100644 man/mlr_pipeops_aggregate.Rd create mode 100644 tests/testthat/test_pipeop_aggregate.R diff --git a/DESCRIPTION b/DESCRIPTION index 9389a34fa..381391143 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -47,7 +47,7 @@ Imports: data.table, digest, lgr, - mlr3 (>= 0.6.0), + mlr3 (>= 0.11.0), mlr3misc (>= 0.7.0), paradox, R6, @@ -102,6 +102,7 @@ Collate: 'LearnerAvg.R' 'NO_OP.R' 'PipeOpTaskPreproc.R' + 'PipeOpAggregate.R' 'PipeOpBoxCox.R' 'PipeOpBranch.R' 'PipeOpChunk.R' diff --git a/NAMESPACE b/NAMESPACE index f4c424ba8..a7cc92ea3 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -27,6 +27,7 @@ export(LearnerRegrAvg) export(Multiplicity) export(NO_OP) export(PipeOp) +export(PipeOpAggregate) export(PipeOpBoxCox) export(PipeOpBranch) export(PipeOpChunk) diff --git a/NEWS.md b/NEWS.md index 9e3910f0d..d6e04b898 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,4 +1,8 @@ # mlr3pipelines 0.3.4-9000 +* Changed PipeOps: + - PipeOpLearnerCV now also wraps a Resampling allowing for a wider use of resampling methods +* New PipeOps: + - PipeOpAggregate # mlr3pipelines 0.3.4 @@ -18,7 +22,6 @@ # mlr3pipelines 0.3.1 * Changed PipeOps: - - PipeOpLearnerCV now also wraps a Resampling allowing for a wider use of resampling methods - PipeOpMissInd now also allows for setting type = integer - PipeOpNMF: now exposes all parameters previously in .options * Changed mlr_graphs: diff --git a/R/PipeOpAggregate.R b/R/PipeOpAggregate.R new file mode 100644 index 000000000..0f96a10da --- /dev/null 
+++ b/R/PipeOpAggregate.R @@ -0,0 +1,148 @@ +#' @title Aggregate Features Row-Wise +#' +#' @usage NULL +#' @name mlr_pipeops_aggregate +#' @format [`R6Class`] object inheriting from [`PipeOpTaskPreprocSimple`]/[`PipeOpTaskPreproc`]/[`PipeOp`]. +#' +#' @description +#' Aggregates features row-wise based on multiple observations indicated via a column of role `row_reference` according to expressions given as formulas. +#' Typically used after [`PipeOpLearnerCV`] and prior to [`PipeOpFeatureUnion`] if the resampling method returned multiple predictions per row id. +#' However, note that not all [`Resampling`][mlr3::Resampling] methods result in at least one prediction per original row id. +#' +#' @section Construction: +#' ``` +#' PipeOpAggregate$new(id = "aggregate", param_vals = list()) +#' ``` +#' * `id` :: `character(1)`\cr +#' Identifier of resulting object, default `"aggregate"`. +#' * `param_vals` :: named `list`\cr +#' List of hyperparameter settings, overwriting the hyperparameter settings that would otherwise be set during construction. Default `list()`. +#' +#' @section Input and Output Channels: +#' Input and output channels are inherited from [`PipeOpTaskPreproc`]. +# +#' The output is a [`Task`][mlr3::Task] with the same target as the input [`Task`][mlr3::Task], with features aggregated as specified. +#' +#' @section State: +#' The `$state` is a named `list` with the `$state` elements inherited from [`PipeOpTaskPreproc`]. +#' +#' @section Parameters: +#' The parameters are the parameters inherited from [`PipeOpTaskPreproc`], as well as: +#' * `aggregation` :: named `list` of `formula`\cr +#' Expressions for how features should be aggregated, in the form of `formula`. +#' Each element of the list is a `formula` with the name of the element naming the feature to aggregate and the formula expression determining the result. +#' Each formula is evaluated within [`data.table`] environments of the [`Task`][mlr3::Task] that contain all features split via the `by` argument (see below). +#' Initialized to `list()`, i.e., no aggregation is performed. +#' * `by` :: `character(1)` | `NULL`\cr +#' Column indicating the `row_reference` column of the [`Task`][mlr3::Task] that should be the row-wise basis for the aggregation. +#' Initialized to `NULL`, i.e., no aggregation is performed. +#' +#' @section Internals: +#' A `formula` created using the `~` operator always contains a reference to the `environment` in which +#' the `formula` is created. This makes it possible to use variables in the `~`-expressions that both +#' reference either column names or variable names. +#' +#' @section Fields: +#' Only fields inherited from [`PipeOpTaskPreproc`]/[`PipeOp`]. +#' +#' @section Methods: +#' Only methods inherited from [`PipeOpTaskPreprocSimple`]/[`PipeOpTaskPreproc`]/[`PipeOp`]. 
+#'
+#' @family PipeOps
+#' @seealso https://mlr3book.mlr-org.com/list-pipeops.html
+#' @include PipeOpTaskPreproc.R
+#' @export
+#' @examples
+#' library("mlr3")
+#' calculate_mode = function(x) {
+#'   unique_x = unique(x)
+#'   unique_x[which.max(tabulate(match(x, unique_x)))]
+#' }
+#'
+#' task = tsk("iris")
+#' learner = lrn("classif.rpart")
+#'
+#' lrnloo_po = po("learner_cv", learner, rsmp("loo"))
+#' nop = mlr_pipeops$get("nop")
+#' agg_po = po("aggregate",
+#'   aggregation = list(
+#'     classif.rpart.response = ~ calculate_mode(classif.rpart.response)
+#'   ),
+#'   by = "pre.classif.rpart")
+#'
+#' graph = gunion(list(
+#'   lrnloo_po %>>% agg_po,
+#'   nop
+#' )) %>>% po("featureunion")
+#'
+#' graph$train(task)
+#'
+#' graph$pipeops$classif.rpart$learner$predict_type = "prob"
+#' graph$param_set$values$aggregate.aggregation = list(
+#'   classif.rpart.prob.setosa = ~ mean(classif.rpart.prob.setosa),
+#'   classif.rpart.prob.versicolor = ~ mean(classif.rpart.prob.versicolor),
+#'   classif.rpart.prob.virginica = ~ mean(classif.rpart.prob.virginica)
+#' )
+#' graph$train(task)
+PipeOpAggregate = R6Class("PipeOpAggregate",
+  inherit = PipeOpTaskPreprocSimple,
+  public = list(
+    initialize = function(id = "aggregate", param_vals = list()) {
+      ps = ParamSet$new(params = list(
+        ParamUty$new("aggregation", tags = c("train", "predict", "required"), custom_check = check_aggregation_formulae),
+        ParamUty$new("by", tags = c("train", "predict", "required"), custom_check = function(x) check_string(x, null.ok = TRUE))
+      ))
+      ps$values = list(aggregation = list(), by = NULL)
+      super$initialize(id, ps, param_vals = param_vals, tags = "ensemble")
+    }
+  ),
+  private = list(
+    .transform = function(task) {
+
+      if (length(self$param_set$values$aggregation) == 0L || is.null(self$param_set$values$by)) {
+        return(task) # early exit
+      }
+
+      assert_set_equal(names(self$param_set$values$aggregation), task$feature_names)
+      assert_choice(self$param_set$values$by, choices = task$col_roles$row_reference)
+
+      taskdata = task$data(cols = c(task$feature_names, task$col_roles$row_reference))
+      taskdata_split = split(taskdata, by = self$param_set$values$by)
+
+      newdata = unique(task$data(cols = c(task$target_names, task$col_roles$row_reference[match(task$col_roles$row_reference, self$param_set$values$by)])), by = self$param_set$values$by)
+
+      nms = names(self$param_set$values$aggregation)
+      for (i in seq_along(nms)) {
+        frm = self$param_set$values$aggregation[[i]]
+        set(newdata, j = nms[i], value = unlist(map(taskdata_split, .f = function(split) eval(frm[[2L]], envir = split, enclos = environment(frm)))))
+      }
+      setnames(newdata, old = self$param_set$values$by, new = task$backend$primary_key)
+
+      # get task_type from mlr_reflections and call constructor
+      constructor = get(mlr_reflections$task_types[["task"]][chmatch(task$task_type, table = mlr_reflections$task_types[["type"]], nomatch = 0L)][[1L]])
+      newtask = invoke(constructor$new, id = task$id, backend = as_data_backend(newdata, primary_key = task$backend$primary_key), target = task$target_names, .args = task$extra_args)
+      newtask$extra_args = task$extra_args
+
+      newtask
+    }
+  )
+)
+
+mlr_pipeops$add("aggregate", PipeOpAggregate)
+
+# check the `aggregation` parameter of PipeOpAggregate
+# @param x [list] whatever `aggregation` is being set to
+# checks that `aggregation` is
+# * a named list of `formula`
+# * that each element is a one-sided formula, i.e. has no left-hand side
+check_aggregation_formulae = function(x) {
+  check_list(x, types = "formula", names = "unique") %check&&%
+    Reduce(`%check&&%`, lapply(x, 
function(xel) { + if (length(xel) != 2) { + return(sprintf("formula %s must not have a left hand side.", + deparse(xel, nlines = 1L, width.cutoff = 500L))) + } + TRUE + }), TRUE) +} + diff --git a/R/PipeOpLearnerCV.R b/R/PipeOpLearnerCV.R index 350956398..8c2c11d6a 100644 --- a/R/PipeOpLearnerCV.R +++ b/R/PipeOpLearnerCV.R @@ -1,4 +1,4 @@ -#' @title Wrap a Learner into a PipeOp with Cross-validated Predictions as Features +#' @title Wrap a Learner into a PipeOp with Resampled Predictions as Features #' #' @usage NULL #' @name mlr_pipeops_learner_cv @@ -16,10 +16,10 @@ #' for `$predict.type` `"prob"` the `.prob.` features are created, and for `$predict.type` `"se"` the new columns #' are `.response` and `.se`. `` denotes the `$id` of the [`PipeOpLearnerCV`] object. #' -#' In the case of the resampling method returning multiple predictions per row id, the predictions are aggregated via their mean -#' (except for the `"response"` in the case of a [classification Task][mlr3::TaskClassif] which is aggregated using the mode). -#' In the case of the resampling method not returning predictions for all row ids as given in the input [`Task`][mlr3::Task], -#' these predictions are added as missing. +#' In the case of the resampling method returning multiple predictions per row id, the predictions +#' are returned unaltered. The output [`Task`][mlr3::Task] always gains a `row_reference` column +#' named `pre.` indicating the original row id prior to the resampling process. [`PipeOpAggregate`] should then +#' be used to aggregate these multiple predictions per row id. #' #' Inherits both the `$param_set` (and therefore `$param_set$values`) from the [`Learner`][mlr3::Learner] and #' [`Resampling`][mlr3::Resampling] it is constructed from. The parameter ids of the latter one are prefixed with `"resampling."` @@ -50,7 +50,7 @@ #' [`PipeOpLearnerCV`] has one output channel named `"output"`, producing a [`Task`][mlr3::Task] specific to the [`Learner`][mlr3::Learner] #' type given to `learner` during construction; both during training and prediction. #' -#' The output is a task with the same target as the input task, with features replaced by predictions made by the [`Learner`][mlr3::Learner]. +#' The output is a [`Task`][mlr3::Task] with the same target as the input [`Task`][mlr3::Task], with features replaced by predictions made by the [`Learner`][mlr3::Learner]. #' During training, this prediction is the out-of-sample prediction made by [`resample`][mlr3::resample], during prediction, this is the #' ordinary prediction made on the data by a [`Learner`][mlr3::Learner] trained on the training phase data. 
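The following sketch illustrates the behaviour described above for a resampling that predicts some rows more than once (here bootstrap); it assumes the reworked PipeOpLearnerCV from this patch, with the default id taken from the wrapped learner.

library("mlr3")
library("mlr3pipelines")
po_lrn = po("learner_cv", lrn("classif.rpart"), rsmp("bootstrap", repeats = 2))
out = po_lrn$train(list(tsk("iris")))[[1]]
out$col_roles$row_reference  # "pre.classif.rpart": original row ids prior to resampling
out$nrow                     # one row per test-set prediction, so typically not 150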
#' @@ -101,7 +101,7 @@ #' task = tsk("iris") #' learner = lrn("classif.rpart") #' -#' lrncv_po = po("learner_cv", learner) +#' lrncv_po = po("learner_cv", learner, rsmp("cv")) #' lrncv_po$learner$predict_type = "response" #' #' nop = mlr_pipeops$get("nop") @@ -131,8 +131,7 @@ PipeOpLearnerCV = R6Class("PipeOpLearnerCV", } - id = id %??% private$.learner$id - # FIXME: probably should restrict to only classif and regr because of the potential aggregation being done below + id = id %??% self$learner$id task_type = mlr_reflections$task_types[get("type") == private$.learner$task_type][order(get("package"))][1L]$task private$.additional_param_set = ParamSet$new(params = list( @@ -179,93 +178,52 @@ PipeOpLearnerCV = R6Class("PipeOpLearnerCV", private = list( .train_task = function(task) { on.exit({private$.learner$state = NULL}) - - # Train a learner for predicting + # train a learner for predicting self$state = private$.learner$train(task)$state - # Compute resampled Predictions + # compute resampled predictions rr = resample(task, private$.learner, private$.resampling) prds = as.data.table(rr$prediction(predict_sets = "test")) - # Some resamplings will result in rows being sampled multiple times and some being missing - nrows_multiple = length(prds$row_id[duplicated(prds$row_id)]) - missing_rows = setdiff(task$row_ids, prds$row_id) - nrows_missing = length(missing_rows) - - if (!nrows_multiple && !nrows_missing) { - return(private$pred_to_task(prds, task)) # early exit - } - - task_type = task$task_type - prds_names = colnames(prds) - - prds_corrected = if (nrows_multiple) { - # classif: prob, regr: response, (se) - SDcols_multiple = setdiff(prds_names, if (task_type == "classif") c("row_id", "truth", "response") else c("row_id", "truth")) - - # aggregation functions: - # - mean for prob, response (regr), se - # - mode for response (classif) - prds_corrected = prds[, map(.SD, function(x) { - if (length(x) == 1L) return(x) # early exit - mean(x, na.rm = TRUE) - }), by = "row_id", .SDcols = SDcols_multiple] - - if (NROW(prds_corrected) == 0L) prds_corrected = unique(prds[, "row_id"]) - - if (task_type == "classif") { - cbind(prds_corrected, prds[, map(.SD, function(x) { - if (length(x) == 1L) return(as.character(x)) # early exit - tt = table(x) - names(tt[which.max(tt)]) - }), by = "row_id", .SDcols = "response"][, "response"]) - } else { - prds_corrected - } - } else { - if (task_type == "classif") { - prds[, "response" := as.character(response)] - } - prds[, !"truth"] - } - - if (nrows_missing) { - SDcols_missing = setdiff(prds_names, "truth") - # add missings - prds_corrected = prds_corrected[, map(.SD, add_missings, len = nrows_missing), .SDcols = SDcols_missing] - prds_corrected$row_id[is.na(prds_corrected$row_id)] = missing_rows - } - - if (task_type == "classif") { - target = task$truth(prds_corrected$row_id) - prds_corrected$response = factor(prds_corrected$response, levels = levels(target), ordered = is.ordered(target)) - } - - # FIXME: do we need additional safety checks here? 
- - private$pred_to_task(prds_corrected, task) + private$.pred_to_task(prds, task) }, .predict_task = function(task) { on.exit({private$.learner$state = NULL}) private$.learner$state = self$state - prediction = as.data.table(private$.learner$predict(task)) - private$pred_to_task(prediction, task) + prds = as.data.table(private$.learner$predict(task)) + private$.pred_to_task(prds, task) }, - pred_to_task = function(prds, task) { - if (!is.null(prds$truth)) prds[, truth := NULL] + .pred_to_task = function(prds, task) { if (!self$param_set$values$keep_response && self$learner$predict_type == "prob") { prds[, response := NULL] } - renaming = setdiff(colnames(prds), c("row_id", "row_ids")) - setnames(prds, renaming, sprintf("%s.%s", self$id, renaming)) + renaming = setdiff(colnames(prds), c("row_ids", "truth")) + setnames(prds, old = renaming, new = sprintf("%s.%s", self$id, renaming)) + setnames(prds, old = "truth", new = task$target_names) + row_reference = paste0("pre.", self$id) + while (row_reference %in% task$col_info$id) { + row_reference = paste0(row_reference, ".") + } + setnames(prds, old = "row_ids", new = row_reference) - # This can be simplified for mlr3 >= 0.11.0; - # will be always "row_ids" - row_id_col = intersect(colnames(prds), c("row_id", "row_ids")) - setnames(prds, old = row_id_col, new = task$backend$primary_key) - task$select(character(0))$cbind(prds) + # the following is needed to pertain correct row ids in the case of e.g. cv + # here we do not necessarily apply PipeOpAggregate later + backend = if (identical(sort(prds[[row_reference]]), sort(task$row_ids))) { + set(prds, j = task$backend$primary_key, value = prds[[row_reference]]) + as_data_backend(prds, primary_key = task$backend$primary_key) + } else { + as_data_backend(prds) + } + + # get task_type from mlr_reflections and call constructor + constructor = get(mlr_reflections$task_types[["task"]][chmatch(task$task_type, table = mlr_reflections$task_types[["type"]], nomatch = 0L)][[1L]]) + newtask = invoke(constructor$new, id = task$id, backend = backend, target = task$target_names, .args = task$extra_args) + newtask$extra_args = task$extra_args + newtask$set_col_roles(row_reference, "row_reference") + + newtask }, .additional_param_set = NULL, .learner = NULL, @@ -273,12 +231,5 @@ PipeOpLearnerCV = R6Class("PipeOpLearnerCV", ) ) -# helper function to add missings to predictions based on their storage mode -add_missings = function(x, len) { - c(x, switch(typeof(x), - "character" = rep_len(NA_character_, length.out = len), - "double" = rep_len(NA_real_, length.out = len), - "integer" = rep_len(NA_integer_, length.out = len))) -} - mlr_pipeops$add("learner_cv", PipeOpLearnerCV, list(R6Class("Learner", public = list(id = "learner_cv", task_type = "classif", param_set = ParamSet$new()))$new())) + diff --git a/R/PipeOpTuneThreshold.R b/R/PipeOpTuneThreshold.R index 1990c5c61..39522762a 100644 --- a/R/PipeOpTuneThreshold.R +++ b/R/PipeOpTuneThreshold.R @@ -143,7 +143,12 @@ PipeOpTuneThreshold = R6Class("PipeOpTuneThreshold", }, .task_to_prediction = function(input) { prob = as.matrix(input$data(cols = input$feature_names)) - colnames(prob) = unlist(input$levels()) + # setting the column names the following way is safer + nms = map_chr(strsplit(colnames(prob), "\\."), function(x) x[length(x)]) + if (!setequal(nms, input$levels(input$target_names)[[input$target_names]])) { + stopf("Cannot assign correct class levels to probability columns.") + } + colnames(prob) = map_chr(strsplit(colnames(prob), "\\."), function(x) 
x[length(x)]) PredictionClassif$new(input, row_ids = input$row_ids, truth = input$truth(), response = factor(colnames(prob)[max.col(prob, ties.method = "random")], levels = unlist(input$levels())), prob = prob) diff --git a/R/zzz.R b/R/zzz.R index 885b08e68..40afa1d20 100644 --- a/R/zzz.R +++ b/R/zzz.R @@ -15,6 +15,9 @@ register_mlr3 = function() { c("abstract", "meta", "missings", "feature selection", "imbalanced data", "data transform", "target transform", "ensemble", "robustify", "learner", "encode", "multiplicity"))) + if (!all(grepl("row_reference", x$task_col_roles))) { + x$task_col_roles = map(x$task_col_roles, function(col_roles) c(col_roles, "row_reference")) + } } .onLoad = function(libname, pkgname) { # nocov start diff --git a/man/PipeOp.Rd b/man/PipeOp.Rd index 15c71495a..b252e56e7 100644 --- a/man/PipeOp.Rd +++ b/man/PipeOp.Rd @@ -225,6 +225,7 @@ Other PipeOps: \code{\link{PipeOpTargetTrafo}}, \code{\link{PipeOpTaskPreprocSimple}}, \code{\link{PipeOpTaskPreproc}}, +\code{\link{mlr_pipeops_aggregate}}, \code{\link{mlr_pipeops_boxcox}}, \code{\link{mlr_pipeops_branch}}, \code{\link{mlr_pipeops_chunk}}, diff --git a/man/PipeOpEnsemble.Rd b/man/PipeOpEnsemble.Rd index f9dc38e0e..f7bc22365 100644 --- a/man/PipeOpEnsemble.Rd +++ b/man/PipeOpEnsemble.Rd @@ -102,6 +102,7 @@ Other PipeOps: \code{\link{PipeOpTaskPreprocSimple}}, \code{\link{PipeOpTaskPreproc}}, \code{\link{PipeOp}}, +\code{\link{mlr_pipeops_aggregate}}, \code{\link{mlr_pipeops_boxcox}}, \code{\link{mlr_pipeops_branch}}, \code{\link{mlr_pipeops_chunk}}, diff --git a/man/PipeOpImpute.Rd b/man/PipeOpImpute.Rd index 2e254b0c8..e29fcc67b 100644 --- a/man/PipeOpImpute.Rd +++ b/man/PipeOpImpute.Rd @@ -132,6 +132,7 @@ Other PipeOps: \code{\link{PipeOpTaskPreprocSimple}}, \code{\link{PipeOpTaskPreproc}}, \code{\link{PipeOp}}, +\code{\link{mlr_pipeops_aggregate}}, \code{\link{mlr_pipeops_boxcox}}, \code{\link{mlr_pipeops_branch}}, \code{\link{mlr_pipeops_chunk}}, diff --git a/man/PipeOpTargetTrafo.Rd b/man/PipeOpTargetTrafo.Rd index 9a567930c..539cfa103 100644 --- a/man/PipeOpTargetTrafo.Rd +++ b/man/PipeOpTargetTrafo.Rd @@ -143,6 +143,7 @@ Other PipeOps: \code{\link{PipeOpTaskPreprocSimple}}, \code{\link{PipeOpTaskPreproc}}, \code{\link{PipeOp}}, +\code{\link{mlr_pipeops_aggregate}}, \code{\link{mlr_pipeops_boxcox}}, \code{\link{mlr_pipeops_branch}}, \code{\link{mlr_pipeops_chunk}}, diff --git a/man/PipeOpTaskPreproc.Rd b/man/PipeOpTaskPreproc.Rd index 54d44c0bb..6b4ac96b1 100644 --- a/man/PipeOpTaskPreproc.Rd +++ b/man/PipeOpTaskPreproc.Rd @@ -192,6 +192,7 @@ Other PipeOps: \code{\link{PipeOpTargetTrafo}}, \code{\link{PipeOpTaskPreprocSimple}}, \code{\link{PipeOp}}, +\code{\link{mlr_pipeops_aggregate}}, \code{\link{mlr_pipeops_boxcox}}, \code{\link{mlr_pipeops_branch}}, \code{\link{mlr_pipeops_chunk}}, diff --git a/man/PipeOpTaskPreprocSimple.Rd b/man/PipeOpTaskPreprocSimple.Rd index 73d30ad7e..7058f250a 100644 --- a/man/PipeOpTaskPreprocSimple.Rd +++ b/man/PipeOpTaskPreprocSimple.Rd @@ -135,6 +135,7 @@ Other PipeOps: \code{\link{PipeOpTargetTrafo}}, \code{\link{PipeOpTaskPreproc}}, \code{\link{PipeOp}}, +\code{\link{mlr_pipeops_aggregate}}, \code{\link{mlr_pipeops_boxcox}}, \code{\link{mlr_pipeops_branch}}, \code{\link{mlr_pipeops_chunk}}, diff --git a/man/mlr_pipeops.Rd b/man/mlr_pipeops.Rd index 156975a4d..e2b3b3452 100644 --- a/man/mlr_pipeops.Rd +++ b/man/mlr_pipeops.Rd @@ -73,6 +73,7 @@ Other PipeOps: \code{\link{PipeOpTaskPreprocSimple}}, \code{\link{PipeOpTaskPreproc}}, \code{\link{PipeOp}}, 
+\code{\link{mlr_pipeops_aggregate}}, \code{\link{mlr_pipeops_boxcox}}, \code{\link{mlr_pipeops_branch}}, \code{\link{mlr_pipeops_chunk}}, diff --git a/man/mlr_pipeops_aggregate.Rd b/man/mlr_pipeops_aggregate.Rd new file mode 100644 index 000000000..2c087840a --- /dev/null +++ b/man/mlr_pipeops_aggregate.Rd @@ -0,0 +1,178 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/PipeOpAggregate.R +\name{mlr_pipeops_aggregate} +\alias{mlr_pipeops_aggregate} +\alias{PipeOpAggregate} +\title{Aggregate Features Row-Wise} +\format{ +\code{\link{R6Class}} object inheriting from \code{\link{PipeOpTaskPreprocSimple}}/\code{\link{PipeOpTaskPreproc}}/\code{\link{PipeOp}}. +} +\description{ +Aggregates features row-wise based on multiple observations indicated via a column of role \code{row_reference} according to expressions given as formulas. +Typically used after \code{\link{PipeOpLearnerCV}} and prior to \code{\link{PipeOpFeatureUnion}} if the resampling method returned multiple predictions per row id. +However, note that not all \code{\link[mlr3:Resampling]{Resampling}} methods result in at least one prediction per original row id. +} +\section{Construction}{ +\preformatted{PipeOpAggregate$new(id = "aggregate", param_vals = list()) +} +\itemize{ +\item \code{id} :: \code{character(1)}\cr +Identifier of resulting object, default \code{"aggregate"}. +\item \code{param_vals} :: named \code{list}\cr +List of hyperparameter settings, overwriting the hyperparameter settings that would otherwise be set during construction. Default \code{list()}. +} +} + +\section{Input and Output Channels}{ + +Input and output channels are inherited from \code{\link{PipeOpTaskPreproc}}. +The output is a \code{\link[mlr3:Task]{Task}} with the same target as the input \code{\link[mlr3:Task]{Task}}, with features aggregated as specified. +} + +\section{State}{ + +The \verb{$state} is a named \code{list} with the \verb{$state} elements inherited from \code{\link{PipeOpTaskPreproc}}. +} + +\section{Parameters}{ + +The parameters are the parameters inherited from \code{\link{PipeOpTaskPreproc}}, as well as: +\itemize{ +\item \code{aggregation} :: named \code{list} of \code{formula}\cr +Expressions for how features should be aggregated, in the form of \code{formula}. +Each element of the list is a \code{formula} with the name of the element naming the feature to aggregate and the formula expression determining the result. +Each formula is evaluated within \code{\link{data.table}} environments of the \code{\link[mlr3:Task]{Task}} that contain all features split via the \code{by} argument (see below). +Initialized to \code{list()}, i.e., no aggregation is performed. +\item \code{by} :: \code{character(1)} | \code{NULL}\cr +Column indicating the \code{row_reference} column of the \code{\link[mlr3:Task]{Task}} that should be the row-wise basis for the aggregation. +Initialized to \code{NULL}, i.e., no aggregation is performed. +} +} + +\section{Internals}{ + +A \code{formula} created using the \code{~} operator always contains a reference to the \code{environment} in which +the \code{formula} is created. This makes it possible to use variables in the \code{~}-expressions that both +reference either column names or variable names. +} + +\section{Fields}{ + +Only fields inherited from \code{\link{PipeOpTaskPreproc}}/\code{\link{PipeOp}}. +} + +\section{Methods}{ + +Only methods inherited from \code{\link{PipeOpTaskPreprocSimple}}/\code{\link{PipeOpTaskPreproc}}/\code{\link{PipeOp}}. 
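As a complement to the classification example shown below, a hedged sketch of aggregating resampled regression predictions; it assumes the PipeOps introduced in this patch, the `<id>.<column>` naming scheme of PipeOpLearnerCV, and that mlr3learners provides `regr.lm`.

library("mlr3")
library("mlr3pipelines")
library("mlr3learners")
po_lrn = po("learner_cv", lrn("regr.lm", predict_type = "se"), rsmp("subsampling", repeats = 3))
po_agg = po("aggregate",
  aggregation = list(
    regr.lm.response = ~ mean(regr.lm.response),
    regr.lm.se = ~ mean(regr.lm.se)
  ),
  by = "pre.regr.lm")
graph = po_lrn %>>% po_agg
graph$train(tsk("mtcars"))  # multiple predictions per row id are averaged per original row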
+} + +\examples{ +library("mlr3") +calculate_mode = function(x) { + unique_x = unique(x) + unique_x[which.max(tabulate(match(x, unique_x)))] +} + +task = tsk("iris") +learner = lrn("classif.rpart") + +lrnloo_po = po("learner_cv", learner, rsmp("loo")) +nop = mlr_pipeops$get("nop") +agg_po = po("aggregate", + aggregation = list( + classif.rpart.response = ~ calculate_mode(classif.rpart.response) + ), + by = "pre.classif.rpart") + +graph = gunion(list( + lrnloo_po \%>>\% agg_po, + nop +)) \%>>\% po("featureunion") + +graph$train(task) + +graph$pipeops$classif.rpart$learner$predict_type = "prob" +graph$param_set$values$aggregate.aggregation = list( + classif.rpart.prob.setosa = ~ mean(classif.rpart.prob.setosa), + classif.rpart.prob.versicolor = ~ mean(classif.rpart.prob.versicolor), + classif.rpart.prob.virginica = ~ mean(classif.rpart.prob.virginica) +) +graph$train(task) +} +\seealso{ +https://mlr3book.mlr-org.com/list-pipeops.html + +Other PipeOps: +\code{\link{PipeOpEnsemble}}, +\code{\link{PipeOpImpute}}, +\code{\link{PipeOpTargetTrafo}}, +\code{\link{PipeOpTaskPreprocSimple}}, +\code{\link{PipeOpTaskPreproc}}, +\code{\link{PipeOp}}, +\code{\link{mlr_pipeops_boxcox}}, +\code{\link{mlr_pipeops_branch}}, +\code{\link{mlr_pipeops_chunk}}, +\code{\link{mlr_pipeops_classbalancing}}, +\code{\link{mlr_pipeops_classifavg}}, +\code{\link{mlr_pipeops_classweights}}, +\code{\link{mlr_pipeops_colapply}}, +\code{\link{mlr_pipeops_collapsefactors}}, +\code{\link{mlr_pipeops_colroles}}, +\code{\link{mlr_pipeops_copy}}, +\code{\link{mlr_pipeops_datefeatures}}, +\code{\link{mlr_pipeops_encodeimpact}}, +\code{\link{mlr_pipeops_encodelmer}}, +\code{\link{mlr_pipeops_encode}}, +\code{\link{mlr_pipeops_featureunion}}, +\code{\link{mlr_pipeops_filter}}, +\code{\link{mlr_pipeops_fixfactors}}, +\code{\link{mlr_pipeops_histbin}}, +\code{\link{mlr_pipeops_ica}}, +\code{\link{mlr_pipeops_imputeconstant}}, +\code{\link{mlr_pipeops_imputehist}}, +\code{\link{mlr_pipeops_imputelearner}}, +\code{\link{mlr_pipeops_imputemean}}, +\code{\link{mlr_pipeops_imputemedian}}, +\code{\link{mlr_pipeops_imputemode}}, +\code{\link{mlr_pipeops_imputeoor}}, +\code{\link{mlr_pipeops_imputesample}}, +\code{\link{mlr_pipeops_kernelpca}}, +\code{\link{mlr_pipeops_learner}}, +\code{\link{mlr_pipeops_missind}}, +\code{\link{mlr_pipeops_modelmatrix}}, +\code{\link{mlr_pipeops_multiplicityexply}}, +\code{\link{mlr_pipeops_multiplicityimply}}, +\code{\link{mlr_pipeops_mutate}}, +\code{\link{mlr_pipeops_nmf}}, +\code{\link{mlr_pipeops_nop}}, +\code{\link{mlr_pipeops_ovrsplit}}, +\code{\link{mlr_pipeops_ovrunite}}, +\code{\link{mlr_pipeops_pca}}, +\code{\link{mlr_pipeops_proxy}}, +\code{\link{mlr_pipeops_quantilebin}}, +\code{\link{mlr_pipeops_randomprojection}}, +\code{\link{mlr_pipeops_randomresponse}}, +\code{\link{mlr_pipeops_regravg}}, +\code{\link{mlr_pipeops_removeconstants}}, +\code{\link{mlr_pipeops_renamecolumns}}, +\code{\link{mlr_pipeops_replicate}}, +\code{\link{mlr_pipeops_scalemaxabs}}, +\code{\link{mlr_pipeops_scalerange}}, +\code{\link{mlr_pipeops_scale}}, +\code{\link{mlr_pipeops_select}}, +\code{\link{mlr_pipeops_smote}}, +\code{\link{mlr_pipeops_spatialsign}}, +\code{\link{mlr_pipeops_subsample}}, +\code{\link{mlr_pipeops_targetinvert}}, +\code{\link{mlr_pipeops_targetmutate}}, +\code{\link{mlr_pipeops_targettrafoscalerange}}, +\code{\link{mlr_pipeops_textvectorizer}}, +\code{\link{mlr_pipeops_threshold}}, +\code{\link{mlr_pipeops_tunethreshold}}, +\code{\link{mlr_pipeops_unbranch}}, 
+\code{\link{mlr_pipeops_updatetarget}}, +\code{\link{mlr_pipeops_vtreat}}, +\code{\link{mlr_pipeops_yeojohnson}}, +\code{\link{mlr_pipeops}} +} +\concept{PipeOps} diff --git a/man/mlr_pipeops_boxcox.Rd b/man/mlr_pipeops_boxcox.Rd index cf7b8b976..a6d52e3f4 100644 --- a/man/mlr_pipeops_boxcox.Rd +++ b/man/mlr_pipeops_boxcox.Rd @@ -85,6 +85,7 @@ Other PipeOps: \code{\link{PipeOpTaskPreprocSimple}}, \code{\link{PipeOpTaskPreproc}}, \code{\link{PipeOp}}, +\code{\link{mlr_pipeops_aggregate}}, \code{\link{mlr_pipeops_branch}}, \code{\link{mlr_pipeops_chunk}}, \code{\link{mlr_pipeops_classbalancing}}, diff --git a/man/mlr_pipeops_branch.Rd b/man/mlr_pipeops_branch.Rd index 256afebab..e5242bba4 100644 --- a/man/mlr_pipeops_branch.Rd +++ b/man/mlr_pipeops_branch.Rd @@ -105,6 +105,7 @@ Other PipeOps: \code{\link{PipeOpTaskPreprocSimple}}, \code{\link{PipeOpTaskPreproc}}, \code{\link{PipeOp}}, +\code{\link{mlr_pipeops_aggregate}}, \code{\link{mlr_pipeops_boxcox}}, \code{\link{mlr_pipeops_chunk}}, \code{\link{mlr_pipeops_classbalancing}}, diff --git a/man/mlr_pipeops_chunk.Rd b/man/mlr_pipeops_chunk.Rd index e7dc01689..3c0787cef 100644 --- a/man/mlr_pipeops_chunk.Rd +++ b/man/mlr_pipeops_chunk.Rd @@ -84,6 +84,7 @@ Other PipeOps: \code{\link{PipeOpTaskPreprocSimple}}, \code{\link{PipeOpTaskPreproc}}, \code{\link{PipeOp}}, +\code{\link{mlr_pipeops_aggregate}}, \code{\link{mlr_pipeops_boxcox}}, \code{\link{mlr_pipeops_branch}}, \code{\link{mlr_pipeops_classbalancing}}, diff --git a/man/mlr_pipeops_classbalancing.Rd b/man/mlr_pipeops_classbalancing.Rd index 4e87e9ac5..3fe479b1e 100644 --- a/man/mlr_pipeops_classbalancing.Rd +++ b/man/mlr_pipeops_classbalancing.Rd @@ -125,6 +125,7 @@ Other PipeOps: \code{\link{PipeOpTaskPreprocSimple}}, \code{\link{PipeOpTaskPreproc}}, \code{\link{PipeOp}}, +\code{\link{mlr_pipeops_aggregate}}, \code{\link{mlr_pipeops_boxcox}}, \code{\link{mlr_pipeops_branch}}, \code{\link{mlr_pipeops_chunk}}, diff --git a/man/mlr_pipeops_classifavg.Rd b/man/mlr_pipeops_classifavg.Rd index f9aab7eb4..e51f59e0d 100644 --- a/man/mlr_pipeops_classifavg.Rd +++ b/man/mlr_pipeops_classifavg.Rd @@ -99,6 +99,7 @@ Other PipeOps: \code{\link{PipeOpTaskPreprocSimple}}, \code{\link{PipeOpTaskPreproc}}, \code{\link{PipeOp}}, +\code{\link{mlr_pipeops_aggregate}}, \code{\link{mlr_pipeops_boxcox}}, \code{\link{mlr_pipeops_branch}}, \code{\link{mlr_pipeops_chunk}}, diff --git a/man/mlr_pipeops_classweights.Rd b/man/mlr_pipeops_classweights.Rd index deed5fcb7..91fcc2282 100644 --- a/man/mlr_pipeops_classweights.Rd +++ b/man/mlr_pipeops_classweights.Rd @@ -93,6 +93,7 @@ Other PipeOps: \code{\link{PipeOpTaskPreprocSimple}}, \code{\link{PipeOpTaskPreproc}}, \code{\link{PipeOp}}, +\code{\link{mlr_pipeops_aggregate}}, \code{\link{mlr_pipeops_boxcox}}, \code{\link{mlr_pipeops_branch}}, \code{\link{mlr_pipeops_chunk}}, diff --git a/man/mlr_pipeops_colapply.Rd b/man/mlr_pipeops_colapply.Rd index ec8ff0d99..fdfa50a5e 100644 --- a/man/mlr_pipeops_colapply.Rd +++ b/man/mlr_pipeops_colapply.Rd @@ -114,6 +114,7 @@ Other PipeOps: \code{\link{PipeOpTaskPreprocSimple}}, \code{\link{PipeOpTaskPreproc}}, \code{\link{PipeOp}}, +\code{\link{mlr_pipeops_aggregate}}, \code{\link{mlr_pipeops_boxcox}}, \code{\link{mlr_pipeops_branch}}, \code{\link{mlr_pipeops_chunk}}, diff --git a/man/mlr_pipeops_collapsefactors.Rd b/man/mlr_pipeops_collapsefactors.Rd index 4404732c0..e06bc020b 100644 --- a/man/mlr_pipeops_collapsefactors.Rd +++ b/man/mlr_pipeops_collapsefactors.Rd @@ -81,6 +81,7 @@ Other PipeOps: 
\code{\link{PipeOpTaskPreprocSimple}}, \code{\link{PipeOpTaskPreproc}}, \code{\link{PipeOp}}, +\code{\link{mlr_pipeops_aggregate}}, \code{\link{mlr_pipeops_boxcox}}, \code{\link{mlr_pipeops_branch}}, \code{\link{mlr_pipeops_chunk}}, diff --git a/man/mlr_pipeops_colroles.Rd b/man/mlr_pipeops_colroles.Rd index f342d33d5..89c08da05 100644 --- a/man/mlr_pipeops_colroles.Rd +++ b/man/mlr_pipeops_colroles.Rd @@ -73,6 +73,7 @@ Other PipeOps: \code{\link{PipeOpTaskPreprocSimple}}, \code{\link{PipeOpTaskPreproc}}, \code{\link{PipeOp}}, +\code{\link{mlr_pipeops_aggregate}}, \code{\link{mlr_pipeops_boxcox}}, \code{\link{mlr_pipeops_branch}}, \code{\link{mlr_pipeops_chunk}}, diff --git a/man/mlr_pipeops_copy.Rd b/man/mlr_pipeops_copy.Rd index 02ae18124..3bf4aae61 100644 --- a/man/mlr_pipeops_copy.Rd +++ b/man/mlr_pipeops_copy.Rd @@ -103,6 +103,7 @@ Other PipeOps: \code{\link{PipeOpTaskPreprocSimple}}, \code{\link{PipeOpTaskPreproc}}, \code{\link{PipeOp}}, +\code{\link{mlr_pipeops_aggregate}}, \code{\link{mlr_pipeops_boxcox}}, \code{\link{mlr_pipeops_branch}}, \code{\link{mlr_pipeops_chunk}}, diff --git a/man/mlr_pipeops_datefeatures.Rd b/man/mlr_pipeops_datefeatures.Rd index 5c84d7451..c35830cdc 100644 --- a/man/mlr_pipeops_datefeatures.Rd +++ b/man/mlr_pipeops_datefeatures.Rd @@ -120,6 +120,7 @@ Other PipeOps: \code{\link{PipeOpTaskPreprocSimple}}, \code{\link{PipeOpTaskPreproc}}, \code{\link{PipeOp}}, +\code{\link{mlr_pipeops_aggregate}}, \code{\link{mlr_pipeops_boxcox}}, \code{\link{mlr_pipeops_branch}}, \code{\link{mlr_pipeops_chunk}}, diff --git a/man/mlr_pipeops_encode.Rd b/man/mlr_pipeops_encode.Rd index 80e336189..5ee052258 100644 --- a/man/mlr_pipeops_encode.Rd +++ b/man/mlr_pipeops_encode.Rd @@ -106,6 +106,7 @@ Other PipeOps: \code{\link{PipeOpTaskPreprocSimple}}, \code{\link{PipeOpTaskPreproc}}, \code{\link{PipeOp}}, +\code{\link{mlr_pipeops_aggregate}}, \code{\link{mlr_pipeops_boxcox}}, \code{\link{mlr_pipeops_branch}}, \code{\link{mlr_pipeops_chunk}}, diff --git a/man/mlr_pipeops_encodeimpact.Rd b/man/mlr_pipeops_encodeimpact.Rd index 0be88b7da..9f2a9afc0 100644 --- a/man/mlr_pipeops_encodeimpact.Rd +++ b/man/mlr_pipeops_encodeimpact.Rd @@ -98,6 +98,7 @@ Other PipeOps: \code{\link{PipeOpTaskPreprocSimple}}, \code{\link{PipeOpTaskPreproc}}, \code{\link{PipeOp}}, +\code{\link{mlr_pipeops_aggregate}}, \code{\link{mlr_pipeops_boxcox}}, \code{\link{mlr_pipeops_branch}}, \code{\link{mlr_pipeops_chunk}}, diff --git a/man/mlr_pipeops_encodelmer.Rd b/man/mlr_pipeops_encodelmer.Rd index aebf5291b..8b84935ce 100644 --- a/man/mlr_pipeops_encodelmer.Rd +++ b/man/mlr_pipeops_encodelmer.Rd @@ -109,6 +109,7 @@ Other PipeOps: \code{\link{PipeOpTaskPreprocSimple}}, \code{\link{PipeOpTaskPreproc}}, \code{\link{PipeOp}}, +\code{\link{mlr_pipeops_aggregate}}, \code{\link{mlr_pipeops_boxcox}}, \code{\link{mlr_pipeops_branch}}, \code{\link{mlr_pipeops_chunk}}, diff --git a/man/mlr_pipeops_featureunion.Rd b/man/mlr_pipeops_featureunion.Rd index c99233a66..6f5c10dd3 100644 --- a/man/mlr_pipeops_featureunion.Rd +++ b/man/mlr_pipeops_featureunion.Rd @@ -118,6 +118,7 @@ Other PipeOps: \code{\link{PipeOpTaskPreprocSimple}}, \code{\link{PipeOpTaskPreproc}}, \code{\link{PipeOp}}, +\code{\link{mlr_pipeops_aggregate}}, \code{\link{mlr_pipeops_boxcox}}, \code{\link{mlr_pipeops_branch}}, \code{\link{mlr_pipeops_chunk}}, diff --git a/man/mlr_pipeops_filter.Rd b/man/mlr_pipeops_filter.Rd index a87ccb638..fdb9d8ef1 100644 --- a/man/mlr_pipeops_filter.Rd +++ b/man/mlr_pipeops_filter.Rd @@ -127,6 +127,7 @@ Other PipeOps: 
\code{\link{PipeOpTaskPreprocSimple}}, \code{\link{PipeOpTaskPreproc}}, \code{\link{PipeOp}}, +\code{\link{mlr_pipeops_aggregate}}, \code{\link{mlr_pipeops_boxcox}}, \code{\link{mlr_pipeops_branch}}, \code{\link{mlr_pipeops_chunk}}, diff --git a/man/mlr_pipeops_fixfactors.Rd b/man/mlr_pipeops_fixfactors.Rd index 66a9226da..61ebd0b39 100644 --- a/man/mlr_pipeops_fixfactors.Rd +++ b/man/mlr_pipeops_fixfactors.Rd @@ -73,6 +73,7 @@ Other PipeOps: \code{\link{PipeOpTaskPreprocSimple}}, \code{\link{PipeOpTaskPreproc}}, \code{\link{PipeOp}}, +\code{\link{mlr_pipeops_aggregate}}, \code{\link{mlr_pipeops_boxcox}}, \code{\link{mlr_pipeops_branch}}, \code{\link{mlr_pipeops_chunk}}, diff --git a/man/mlr_pipeops_histbin.Rd b/man/mlr_pipeops_histbin.Rd index 7cef85cce..0eabd0a4a 100644 --- a/man/mlr_pipeops_histbin.Rd +++ b/man/mlr_pipeops_histbin.Rd @@ -85,6 +85,7 @@ Other PipeOps: \code{\link{PipeOpTaskPreprocSimple}}, \code{\link{PipeOpTaskPreproc}}, \code{\link{PipeOp}}, +\code{\link{mlr_pipeops_aggregate}}, \code{\link{mlr_pipeops_boxcox}}, \code{\link{mlr_pipeops_branch}}, \code{\link{mlr_pipeops_chunk}}, diff --git a/man/mlr_pipeops_ica.Rd b/man/mlr_pipeops_ica.Rd index da0800c70..cae4243ba 100644 --- a/man/mlr_pipeops_ica.Rd +++ b/man/mlr_pipeops_ica.Rd @@ -111,6 +111,7 @@ Other PipeOps: \code{\link{PipeOpTaskPreprocSimple}}, \code{\link{PipeOpTaskPreproc}}, \code{\link{PipeOp}}, +\code{\link{mlr_pipeops_aggregate}}, \code{\link{mlr_pipeops_boxcox}}, \code{\link{mlr_pipeops_branch}}, \code{\link{mlr_pipeops_chunk}}, diff --git a/man/mlr_pipeops_imputeconstant.Rd b/man/mlr_pipeops_imputeconstant.Rd index 4ffd9ecdb..5392bf6e8 100644 --- a/man/mlr_pipeops_imputeconstant.Rd +++ b/man/mlr_pipeops_imputeconstant.Rd @@ -87,6 +87,7 @@ Other PipeOps: \code{\link{PipeOpTaskPreprocSimple}}, \code{\link{PipeOpTaskPreproc}}, \code{\link{PipeOp}}, +\code{\link{mlr_pipeops_aggregate}}, \code{\link{mlr_pipeops_boxcox}}, \code{\link{mlr_pipeops_branch}}, \code{\link{mlr_pipeops_chunk}}, diff --git a/man/mlr_pipeops_imputehist.Rd b/man/mlr_pipeops_imputehist.Rd index 43a3beb86..ea5dd8a94 100644 --- a/man/mlr_pipeops_imputehist.Rd +++ b/man/mlr_pipeops_imputehist.Rd @@ -72,6 +72,7 @@ Other PipeOps: \code{\link{PipeOpTaskPreprocSimple}}, \code{\link{PipeOpTaskPreproc}}, \code{\link{PipeOp}}, +\code{\link{mlr_pipeops_aggregate}}, \code{\link{mlr_pipeops_boxcox}}, \code{\link{mlr_pipeops_branch}}, \code{\link{mlr_pipeops_chunk}}, diff --git a/man/mlr_pipeops_imputelearner.Rd b/man/mlr_pipeops_imputelearner.Rd index f86074f27..f4eada177 100644 --- a/man/mlr_pipeops_imputelearner.Rd +++ b/man/mlr_pipeops_imputelearner.Rd @@ -101,6 +101,7 @@ Other PipeOps: \code{\link{PipeOpTaskPreprocSimple}}, \code{\link{PipeOpTaskPreproc}}, \code{\link{PipeOp}}, +\code{\link{mlr_pipeops_aggregate}}, \code{\link{mlr_pipeops_boxcox}}, \code{\link{mlr_pipeops_branch}}, \code{\link{mlr_pipeops_chunk}}, diff --git a/man/mlr_pipeops_imputemean.Rd b/man/mlr_pipeops_imputemean.Rd index 9a34246aa..15016de56 100644 --- a/man/mlr_pipeops_imputemean.Rd +++ b/man/mlr_pipeops_imputemean.Rd @@ -72,6 +72,7 @@ Other PipeOps: \code{\link{PipeOpTaskPreprocSimple}}, \code{\link{PipeOpTaskPreproc}}, \code{\link{PipeOp}}, +\code{\link{mlr_pipeops_aggregate}}, \code{\link{mlr_pipeops_boxcox}}, \code{\link{mlr_pipeops_branch}}, \code{\link{mlr_pipeops_chunk}}, diff --git a/man/mlr_pipeops_imputemedian.Rd b/man/mlr_pipeops_imputemedian.Rd index b89c02ee3..82df3dd15 100644 --- a/man/mlr_pipeops_imputemedian.Rd +++ b/man/mlr_pipeops_imputemedian.Rd @@ -72,6 
+72,7 @@ Other PipeOps: \code{\link{PipeOpTaskPreprocSimple}}, \code{\link{PipeOpTaskPreproc}}, \code{\link{PipeOp}}, +\code{\link{mlr_pipeops_aggregate}}, \code{\link{mlr_pipeops_boxcox}}, \code{\link{mlr_pipeops_branch}}, \code{\link{mlr_pipeops_chunk}}, diff --git a/man/mlr_pipeops_imputemode.Rd b/man/mlr_pipeops_imputemode.Rd index 1ec28fc65..c82b59fe4 100644 --- a/man/mlr_pipeops_imputemode.Rd +++ b/man/mlr_pipeops_imputemode.Rd @@ -79,6 +79,7 @@ Other PipeOps: \code{\link{PipeOpTaskPreprocSimple}}, \code{\link{PipeOpTaskPreproc}}, \code{\link{PipeOp}}, +\code{\link{mlr_pipeops_aggregate}}, \code{\link{mlr_pipeops_boxcox}}, \code{\link{mlr_pipeops_branch}}, \code{\link{mlr_pipeops_chunk}}, diff --git a/man/mlr_pipeops_imputeoor.Rd b/man/mlr_pipeops_imputeoor.Rd index c141c4d33..cf07c0d3a 100644 --- a/man/mlr_pipeops_imputeoor.Rd +++ b/man/mlr_pipeops_imputeoor.Rd @@ -101,6 +101,7 @@ Other PipeOps: \code{\link{PipeOpTaskPreprocSimple}}, \code{\link{PipeOpTaskPreproc}}, \code{\link{PipeOp}}, +\code{\link{mlr_pipeops_aggregate}}, \code{\link{mlr_pipeops_boxcox}}, \code{\link{mlr_pipeops_branch}}, \code{\link{mlr_pipeops_chunk}}, diff --git a/man/mlr_pipeops_imputesample.Rd b/man/mlr_pipeops_imputesample.Rd index e31bcb461..1617527a8 100644 --- a/man/mlr_pipeops_imputesample.Rd +++ b/man/mlr_pipeops_imputesample.Rd @@ -74,6 +74,7 @@ Other PipeOps: \code{\link{PipeOpTaskPreprocSimple}}, \code{\link{PipeOpTaskPreproc}}, \code{\link{PipeOp}}, +\code{\link{mlr_pipeops_aggregate}}, \code{\link{mlr_pipeops_boxcox}}, \code{\link{mlr_pipeops_branch}}, \code{\link{mlr_pipeops_chunk}}, diff --git a/man/mlr_pipeops_kernelpca.Rd b/man/mlr_pipeops_kernelpca.Rd index 1b426e65d..85ec21d15 100644 --- a/man/mlr_pipeops_kernelpca.Rd +++ b/man/mlr_pipeops_kernelpca.Rd @@ -86,6 +86,7 @@ Other PipeOps: \code{\link{PipeOpTaskPreprocSimple}}, \code{\link{PipeOpTaskPreproc}}, \code{\link{PipeOp}}, +\code{\link{mlr_pipeops_aggregate}}, \code{\link{mlr_pipeops_boxcox}}, \code{\link{mlr_pipeops_branch}}, \code{\link{mlr_pipeops_chunk}}, diff --git a/man/mlr_pipeops_learner.Rd b/man/mlr_pipeops_learner.Rd index 9a5a12024..09787d973 100644 --- a/man/mlr_pipeops_learner.Rd +++ b/man/mlr_pipeops_learner.Rd @@ -105,6 +105,7 @@ Other PipeOps: \code{\link{PipeOpTaskPreprocSimple}}, \code{\link{PipeOpTaskPreproc}}, \code{\link{PipeOp}}, +\code{\link{mlr_pipeops_aggregate}}, \code{\link{mlr_pipeops_boxcox}}, \code{\link{mlr_pipeops_branch}}, \code{\link{mlr_pipeops_chunk}}, diff --git a/man/mlr_pipeops_learner_cv.Rd b/man/mlr_pipeops_learner_cv.Rd index 9bcadeeec..1eb3457e4 100644 --- a/man/mlr_pipeops_learner_cv.Rd +++ b/man/mlr_pipeops_learner_cv.Rd @@ -3,7 +3,7 @@ \name{mlr_pipeops_learner_cv} \alias{mlr_pipeops_learner_cv} \alias{PipeOpLearnerCV} -\title{Wrap a Learner into a PipeOp with Cross-validated Predictions as Features} +\title{Wrap a Learner into a PipeOp with Resampled Predictions as Features} \format{ \code{\link{R6Class}} object inheriting from \code{\link{PipeOpTaskPreproc}}/\code{\link{PipeOp}}. } @@ -19,10 +19,10 @@ The \code{\link[mlr3:Task]{Task}} gets features depending on the capsuled \code{ for \verb{$predict.type} \code{"prob"} the \verb{.prob.} features are created, and for \verb{$predict.type} \code{"se"} the new columns are \verb{.response} and \verb{.se}. \verb{} denotes the \verb{$id} of the \code{\link{PipeOpLearnerCV}} object. 
-In the case of the resampling method returning multiple predictions per row id, the predictions are aggregated via their mean -(except for the \code{"response"} in the case of a \link[mlr3:TaskClassif]{classification Task} which is aggregated using the mode). -In the case of the resampling method not returning predictions for all row ids as given in the input \code{\link[mlr3:Task]{Task}}, -these predictions are added as missing. +If the resampling method returns multiple predictions per row id, the predictions +are returned unaltered. The output \code{\link[mlr3:Task]{Task}} always gains a \code{row_reference} column +named \verb{pre.<ID>} (with \verb{<ID>} the \verb{$id} of this \code{\link{PipeOpLearnerCV}}) indicating the original row id prior to the resampling process. \code{\link{PipeOpAggregate}} should then +be used to aggregate these multiple predictions per row id. Inherits both the \verb{$param_set} (and therefore \verb{$param_set$values}) from the \code{\link[mlr3:Learner]{Learner}} and \code{\link[mlr3:Resampling]{Resampling}} it is constructed from. The parameter ids of the latter one are prefixed with \code{"resampling."} @@ -55,7 +55,7 @@ type given to \code{learner} during construction; both during training and predi \code{\link{PipeOpLearnerCV}} has one output channel named \code{"output"}, producing a \code{\link[mlr3:Task]{Task}} specific to the \code{\link[mlr3:Learner]{Learner}} type given to \code{learner} during construction; both during training and prediction. -The output is a task with the same target as the input task, with features replaced by predictions made by the \code{\link[mlr3:Learner]{Learner}}. +The output is a \code{\link[mlr3:Task]{Task}} with the same target as the input \code{\link[mlr3:Task]{Task}}, with features replaced by predictions made by the \code{\link[mlr3:Learner]{Learner}}. During training, this prediction is the out-of-sample prediction made by \code{\link[mlr3:resample]{resample}}, during prediction, this is the ordinary prediction made on the data by a \code{\link[mlr3:Learner]{Learner}} trained on the training phase data.
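As a rough usage sketch (not part of the patch itself; it only mirrors the tests added further below, and the "classif.rpart"-based column names are assumptions following the default naming scheme), a resampling that yields several predictions per original row can be aggregated back to one row per row id like this:

library("mlr3")
library("mlr3pipelines")

task = tsk("iris")
# bootstrap produces several test-set predictions per original row
polrn = PipeOpLearnerCV$new(lrn("classif.rpart"), rsmp("bootstrap", repeats = 2L))
train_out = polrn$train(list(task))[[1L]]
# train_out has one row per prediction; the row_reference column
# "pre.classif.rpart" maps each row back to its original row id

# aggregate the resampled predictions per original row (mode of the response),
# using the same mode helper as in the tests of this patch
calculate_mode = function(x) {
  unique_x = unique(x)
  unique_x[which.max(tabulate(match(x, unique_x)))]
}
poagg = PipeOpAggregate$new(param_vals = list(
  aggregation = list(classif.rpart.response = ~ calculate_mode(classif.rpart.response)),
  by = "pre.classif.rpart"
))
agg_out = poagg$train(list(train_out))[[1L]]  # one row per original row id again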
} @@ -118,7 +118,7 @@ library("mlr3") task = tsk("iris") learner = lrn("classif.rpart") -lrncv_po = po("learner_cv", learner) +lrncv_po = po("learner_cv", learner, rsmp("cv")) lrncv_po$learner$predict_type = "response" nop = mlr_pipeops$get("nop") diff --git a/man/mlr_pipeops_missind.Rd b/man/mlr_pipeops_missind.Rd index 2e04a6645..00f6b7589 100644 --- a/man/mlr_pipeops_missind.Rd +++ b/man/mlr_pipeops_missind.Rd @@ -101,6 +101,7 @@ Other PipeOps: \code{\link{PipeOpTaskPreprocSimple}}, \code{\link{PipeOpTaskPreproc}}, \code{\link{PipeOp}}, +\code{\link{mlr_pipeops_aggregate}}, \code{\link{mlr_pipeops_boxcox}}, \code{\link{mlr_pipeops_branch}}, \code{\link{mlr_pipeops_chunk}}, diff --git a/man/mlr_pipeops_modelmatrix.Rd b/man/mlr_pipeops_modelmatrix.Rd index a001d496c..35fcee80f 100644 --- a/man/mlr_pipeops_modelmatrix.Rd +++ b/man/mlr_pipeops_modelmatrix.Rd @@ -78,6 +78,7 @@ Other PipeOps: \code{\link{PipeOpTaskPreprocSimple}}, \code{\link{PipeOpTaskPreproc}}, \code{\link{PipeOp}}, +\code{\link{mlr_pipeops_aggregate}}, \code{\link{mlr_pipeops_boxcox}}, \code{\link{mlr_pipeops_branch}}, \code{\link{mlr_pipeops_chunk}}, diff --git a/man/mlr_pipeops_multiplicityexply.Rd b/man/mlr_pipeops_multiplicityexply.Rd index bd0398108..9aa10147b 100644 --- a/man/mlr_pipeops_multiplicityexply.Rd +++ b/man/mlr_pipeops_multiplicityexply.Rd @@ -84,6 +84,7 @@ Other PipeOps: \code{\link{PipeOpTaskPreprocSimple}}, \code{\link{PipeOpTaskPreproc}}, \code{\link{PipeOp}}, +\code{\link{mlr_pipeops_aggregate}}, \code{\link{mlr_pipeops_boxcox}}, \code{\link{mlr_pipeops_branch}}, \code{\link{mlr_pipeops_chunk}}, diff --git a/man/mlr_pipeops_multiplicityimply.Rd b/man/mlr_pipeops_multiplicityimply.Rd index e7fa51394..1b820c814 100644 --- a/man/mlr_pipeops_multiplicityimply.Rd +++ b/man/mlr_pipeops_multiplicityimply.Rd @@ -90,6 +90,7 @@ Other PipeOps: \code{\link{PipeOpTaskPreprocSimple}}, \code{\link{PipeOpTaskPreproc}}, \code{\link{PipeOp}}, +\code{\link{mlr_pipeops_aggregate}}, \code{\link{mlr_pipeops_boxcox}}, \code{\link{mlr_pipeops_branch}}, \code{\link{mlr_pipeops_chunk}}, diff --git a/man/mlr_pipeops_mutate.Rd b/man/mlr_pipeops_mutate.Rd index d8b9aa8d4..9554ced38 100644 --- a/man/mlr_pipeops_mutate.Rd +++ b/man/mlr_pipeops_mutate.Rd @@ -95,6 +95,7 @@ Other PipeOps: \code{\link{PipeOpTaskPreprocSimple}}, \code{\link{PipeOpTaskPreproc}}, \code{\link{PipeOp}}, +\code{\link{mlr_pipeops_aggregate}}, \code{\link{mlr_pipeops_boxcox}}, \code{\link{mlr_pipeops_branch}}, \code{\link{mlr_pipeops_chunk}}, diff --git a/man/mlr_pipeops_nmf.Rd b/man/mlr_pipeops_nmf.Rd index 69de35de1..eb3602e6b 100644 --- a/man/mlr_pipeops_nmf.Rd +++ b/man/mlr_pipeops_nmf.Rd @@ -124,6 +124,7 @@ Other PipeOps: \code{\link{PipeOpTaskPreprocSimple}}, \code{\link{PipeOpTaskPreproc}}, \code{\link{PipeOp}}, +\code{\link{mlr_pipeops_aggregate}}, \code{\link{mlr_pipeops_boxcox}}, \code{\link{mlr_pipeops_branch}}, \code{\link{mlr_pipeops_chunk}}, diff --git a/man/mlr_pipeops_nop.Rd b/man/mlr_pipeops_nop.Rd index 72e23ec84..29633ce14 100644 --- a/man/mlr_pipeops_nop.Rd +++ b/man/mlr_pipeops_nop.Rd @@ -80,6 +80,7 @@ Other PipeOps: \code{\link{PipeOpTaskPreprocSimple}}, \code{\link{PipeOpTaskPreproc}}, \code{\link{PipeOp}}, +\code{\link{mlr_pipeops_aggregate}}, \code{\link{mlr_pipeops_boxcox}}, \code{\link{mlr_pipeops_branch}}, \code{\link{mlr_pipeops_chunk}}, diff --git a/man/mlr_pipeops_ovrsplit.Rd b/man/mlr_pipeops_ovrsplit.Rd index 7d7e62379..172815f29 100644 --- a/man/mlr_pipeops_ovrsplit.Rd +++ b/man/mlr_pipeops_ovrsplit.Rd @@ -95,6 +95,7 @@ Other 
PipeOps: \code{\link{PipeOpTaskPreprocSimple}}, \code{\link{PipeOpTaskPreproc}}, \code{\link{PipeOp}}, +\code{\link{mlr_pipeops_aggregate}}, \code{\link{mlr_pipeops_boxcox}}, \code{\link{mlr_pipeops_branch}}, \code{\link{mlr_pipeops_chunk}}, diff --git a/man/mlr_pipeops_ovrunite.Rd b/man/mlr_pipeops_ovrunite.Rd index 4c58a76fe..64ffaff54 100644 --- a/man/mlr_pipeops_ovrunite.Rd +++ b/man/mlr_pipeops_ovrunite.Rd @@ -90,6 +90,7 @@ Other PipeOps: \code{\link{PipeOpTaskPreprocSimple}}, \code{\link{PipeOpTaskPreproc}}, \code{\link{PipeOp}}, +\code{\link{mlr_pipeops_aggregate}}, \code{\link{mlr_pipeops_boxcox}}, \code{\link{mlr_pipeops_branch}}, \code{\link{mlr_pipeops_chunk}}, diff --git a/man/mlr_pipeops_pca.Rd b/man/mlr_pipeops_pca.Rd index df07ac656..a968adccd 100644 --- a/man/mlr_pipeops_pca.Rd +++ b/man/mlr_pipeops_pca.Rd @@ -89,6 +89,7 @@ Other PipeOps: \code{\link{PipeOpTaskPreprocSimple}}, \code{\link{PipeOpTaskPreproc}}, \code{\link{PipeOp}}, +\code{\link{mlr_pipeops_aggregate}}, \code{\link{mlr_pipeops_boxcox}}, \code{\link{mlr_pipeops_branch}}, \code{\link{mlr_pipeops_chunk}}, diff --git a/man/mlr_pipeops_proxy.Rd b/man/mlr_pipeops_proxy.Rd index 343e40014..a110c3a7f 100644 --- a/man/mlr_pipeops_proxy.Rd +++ b/man/mlr_pipeops_proxy.Rd @@ -101,6 +101,7 @@ Other PipeOps: \code{\link{PipeOpTaskPreprocSimple}}, \code{\link{PipeOpTaskPreproc}}, \code{\link{PipeOp}}, +\code{\link{mlr_pipeops_aggregate}}, \code{\link{mlr_pipeops_boxcox}}, \code{\link{mlr_pipeops_branch}}, \code{\link{mlr_pipeops_chunk}}, diff --git a/man/mlr_pipeops_quantilebin.Rd b/man/mlr_pipeops_quantilebin.Rd index 59c70c60e..113d10669 100644 --- a/man/mlr_pipeops_quantilebin.Rd +++ b/man/mlr_pipeops_quantilebin.Rd @@ -77,6 +77,7 @@ Other PipeOps: \code{\link{PipeOpTaskPreprocSimple}}, \code{\link{PipeOpTaskPreproc}}, \code{\link{PipeOp}}, +\code{\link{mlr_pipeops_aggregate}}, \code{\link{mlr_pipeops_boxcox}}, \code{\link{mlr_pipeops_branch}}, \code{\link{mlr_pipeops_chunk}}, diff --git a/man/mlr_pipeops_randomprojection.Rd b/man/mlr_pipeops_randomprojection.Rd index 7567e8ef0..96dd4906d 100644 --- a/man/mlr_pipeops_randomprojection.Rd +++ b/man/mlr_pipeops_randomprojection.Rd @@ -89,6 +89,7 @@ Other PipeOps: \code{\link{PipeOpTaskPreprocSimple}}, \code{\link{PipeOpTaskPreproc}}, \code{\link{PipeOp}}, +\code{\link{mlr_pipeops_aggregate}}, \code{\link{mlr_pipeops_boxcox}}, \code{\link{mlr_pipeops_branch}}, \code{\link{mlr_pipeops_chunk}}, diff --git a/man/mlr_pipeops_randomresponse.Rd b/man/mlr_pipeops_randomresponse.Rd index 557be29e7..9191ea642 100644 --- a/man/mlr_pipeops_randomresponse.Rd +++ b/man/mlr_pipeops_randomresponse.Rd @@ -104,6 +104,7 @@ Other PipeOps: \code{\link{PipeOpTaskPreprocSimple}}, \code{\link{PipeOpTaskPreproc}}, \code{\link{PipeOp}}, +\code{\link{mlr_pipeops_aggregate}}, \code{\link{mlr_pipeops_boxcox}}, \code{\link{mlr_pipeops_branch}}, \code{\link{mlr_pipeops_chunk}}, diff --git a/man/mlr_pipeops_regravg.Rd b/man/mlr_pipeops_regravg.Rd index 054da76d8..f25ab5a40 100644 --- a/man/mlr_pipeops_regravg.Rd +++ b/man/mlr_pipeops_regravg.Rd @@ -90,6 +90,7 @@ Other PipeOps: \code{\link{PipeOpTaskPreprocSimple}}, \code{\link{PipeOpTaskPreproc}}, \code{\link{PipeOp}}, +\code{\link{mlr_pipeops_aggregate}}, \code{\link{mlr_pipeops_boxcox}}, \code{\link{mlr_pipeops_branch}}, \code{\link{mlr_pipeops_chunk}}, diff --git a/man/mlr_pipeops_removeconstants.Rd b/man/mlr_pipeops_removeconstants.Rd index e4743aff6..e5a318c03 100644 --- a/man/mlr_pipeops_removeconstants.Rd +++ b/man/mlr_pipeops_removeconstants.Rd @@ 
-82,6 +82,7 @@ Other PipeOps: \code{\link{PipeOpTaskPreprocSimple}}, \code{\link{PipeOpTaskPreproc}}, \code{\link{PipeOp}}, +\code{\link{mlr_pipeops_aggregate}}, \code{\link{mlr_pipeops_boxcox}}, \code{\link{mlr_pipeops_branch}}, \code{\link{mlr_pipeops_chunk}}, diff --git a/man/mlr_pipeops_renamecolumns.Rd b/man/mlr_pipeops_renamecolumns.Rd index 714611a68..299595f29 100644 --- a/man/mlr_pipeops_renamecolumns.Rd +++ b/man/mlr_pipeops_renamecolumns.Rd @@ -81,6 +81,7 @@ Other PipeOps: \code{\link{PipeOpTaskPreprocSimple}}, \code{\link{PipeOpTaskPreproc}}, \code{\link{PipeOp}}, +\code{\link{mlr_pipeops_aggregate}}, \code{\link{mlr_pipeops_boxcox}}, \code{\link{mlr_pipeops_branch}}, \code{\link{mlr_pipeops_chunk}}, diff --git a/man/mlr_pipeops_replicate.Rd b/man/mlr_pipeops_replicate.Rd index 5a5a4ab15..dea415fac 100644 --- a/man/mlr_pipeops_replicate.Rd +++ b/man/mlr_pipeops_replicate.Rd @@ -74,6 +74,7 @@ Other PipeOps: \code{\link{PipeOpTaskPreprocSimple}}, \code{\link{PipeOpTaskPreproc}}, \code{\link{PipeOp}}, +\code{\link{mlr_pipeops_aggregate}}, \code{\link{mlr_pipeops_boxcox}}, \code{\link{mlr_pipeops_branch}}, \code{\link{mlr_pipeops_chunk}}, diff --git a/man/mlr_pipeops_scale.Rd b/man/mlr_pipeops_scale.Rd index 1189e238b..718c68032 100644 --- a/man/mlr_pipeops_scale.Rd +++ b/man/mlr_pipeops_scale.Rd @@ -96,6 +96,7 @@ Other PipeOps: \code{\link{PipeOpTaskPreprocSimple}}, \code{\link{PipeOpTaskPreproc}}, \code{\link{PipeOp}}, +\code{\link{mlr_pipeops_aggregate}}, \code{\link{mlr_pipeops_boxcox}}, \code{\link{mlr_pipeops_branch}}, \code{\link{mlr_pipeops_chunk}}, diff --git a/man/mlr_pipeops_scalemaxabs.Rd b/man/mlr_pipeops_scalemaxabs.Rd index cf765c8dc..d7c72eb6f 100644 --- a/man/mlr_pipeops_scalemaxabs.Rd +++ b/man/mlr_pipeops_scalemaxabs.Rd @@ -71,6 +71,7 @@ Other PipeOps: \code{\link{PipeOpTaskPreprocSimple}}, \code{\link{PipeOpTaskPreproc}}, \code{\link{PipeOp}}, +\code{\link{mlr_pipeops_aggregate}}, \code{\link{mlr_pipeops_boxcox}}, \code{\link{mlr_pipeops_branch}}, \code{\link{mlr_pipeops_chunk}}, diff --git a/man/mlr_pipeops_scalerange.Rd b/man/mlr_pipeops_scalerange.Rd index 34c58e39d..93c2a01bb 100644 --- a/man/mlr_pipeops_scalerange.Rd +++ b/man/mlr_pipeops_scalerange.Rd @@ -76,6 +76,7 @@ Other PipeOps: \code{\link{PipeOpTaskPreprocSimple}}, \code{\link{PipeOpTaskPreproc}}, \code{\link{PipeOp}}, +\code{\link{mlr_pipeops_aggregate}}, \code{\link{mlr_pipeops_boxcox}}, \code{\link{mlr_pipeops_branch}}, \code{\link{mlr_pipeops_chunk}}, diff --git a/man/mlr_pipeops_select.Rd b/man/mlr_pipeops_select.Rd index ffaf3c5a7..df47a817c 100644 --- a/man/mlr_pipeops_select.Rd +++ b/man/mlr_pipeops_select.Rd @@ -92,6 +92,7 @@ Other PipeOps: \code{\link{PipeOpTaskPreprocSimple}}, \code{\link{PipeOpTaskPreproc}}, \code{\link{PipeOp}}, +\code{\link{mlr_pipeops_aggregate}}, \code{\link{mlr_pipeops_boxcox}}, \code{\link{mlr_pipeops_branch}}, \code{\link{mlr_pipeops_chunk}}, diff --git a/man/mlr_pipeops_smote.Rd b/man/mlr_pipeops_smote.Rd index c6870bda0..59fd8e0d3 100644 --- a/man/mlr_pipeops_smote.Rd +++ b/man/mlr_pipeops_smote.Rd @@ -93,6 +93,7 @@ Other PipeOps: \code{\link{PipeOpTaskPreprocSimple}}, \code{\link{PipeOpTaskPreproc}}, \code{\link{PipeOp}}, +\code{\link{mlr_pipeops_aggregate}}, \code{\link{mlr_pipeops_boxcox}}, \code{\link{mlr_pipeops_branch}}, \code{\link{mlr_pipeops_chunk}}, diff --git a/man/mlr_pipeops_spatialsign.Rd b/man/mlr_pipeops_spatialsign.Rd index e8b2ee70c..eeb735863 100644 --- a/man/mlr_pipeops_spatialsign.Rd +++ b/man/mlr_pipeops_spatialsign.Rd @@ -71,6 +71,7 @@ 
Other PipeOps: \code{\link{PipeOpTaskPreprocSimple}}, \code{\link{PipeOpTaskPreproc}}, \code{\link{PipeOp}}, +\code{\link{mlr_pipeops_aggregate}}, \code{\link{mlr_pipeops_boxcox}}, \code{\link{mlr_pipeops_branch}}, \code{\link{mlr_pipeops_chunk}}, diff --git a/man/mlr_pipeops_subsample.Rd b/man/mlr_pipeops_subsample.Rd index a66619dd4..2f4c2e5ea 100644 --- a/man/mlr_pipeops_subsample.Rd +++ b/man/mlr_pipeops_subsample.Rd @@ -86,6 +86,7 @@ Other PipeOps: \code{\link{PipeOpTaskPreprocSimple}}, \code{\link{PipeOpTaskPreproc}}, \code{\link{PipeOp}}, +\code{\link{mlr_pipeops_aggregate}}, \code{\link{mlr_pipeops_boxcox}}, \code{\link{mlr_pipeops_branch}}, \code{\link{mlr_pipeops_chunk}}, diff --git a/man/mlr_pipeops_targetinvert.Rd b/man/mlr_pipeops_targetinvert.Rd index e76f0f094..33b7c9d02 100644 --- a/man/mlr_pipeops_targetinvert.Rd +++ b/man/mlr_pipeops_targetinvert.Rd @@ -71,6 +71,7 @@ Other PipeOps: \code{\link{PipeOpTaskPreprocSimple}}, \code{\link{PipeOpTaskPreproc}}, \code{\link{PipeOp}}, +\code{\link{mlr_pipeops_aggregate}}, \code{\link{mlr_pipeops_boxcox}}, \code{\link{mlr_pipeops_branch}}, \code{\link{mlr_pipeops_chunk}}, diff --git a/man/mlr_pipeops_targetmutate.Rd b/man/mlr_pipeops_targetmutate.Rd index 6c4953cdb..fc437d1e3 100644 --- a/man/mlr_pipeops_targetmutate.Rd +++ b/man/mlr_pipeops_targetmutate.Rd @@ -117,6 +117,7 @@ Other PipeOps: \code{\link{PipeOpTaskPreprocSimple}}, \code{\link{PipeOpTaskPreproc}}, \code{\link{PipeOp}}, +\code{\link{mlr_pipeops_aggregate}}, \code{\link{mlr_pipeops_boxcox}}, \code{\link{mlr_pipeops_branch}}, \code{\link{mlr_pipeops_chunk}}, diff --git a/man/mlr_pipeops_targettrafoscalerange.Rd b/man/mlr_pipeops_targettrafoscalerange.Rd index 53f983901..c3bf733d9 100644 --- a/man/mlr_pipeops_targettrafoscalerange.Rd +++ b/man/mlr_pipeops_targettrafoscalerange.Rd @@ -83,6 +83,7 @@ Other PipeOps: \code{\link{PipeOpTaskPreprocSimple}}, \code{\link{PipeOpTaskPreproc}}, \code{\link{PipeOp}}, +\code{\link{mlr_pipeops_aggregate}}, \code{\link{mlr_pipeops_boxcox}}, \code{\link{mlr_pipeops_branch}}, \code{\link{mlr_pipeops_chunk}}, diff --git a/man/mlr_pipeops_textvectorizer.Rd b/man/mlr_pipeops_textvectorizer.Rd index fccc3503c..c392f396f 100644 --- a/man/mlr_pipeops_textvectorizer.Rd +++ b/man/mlr_pipeops_textvectorizer.Rd @@ -181,6 +181,7 @@ Other PipeOps: \code{\link{PipeOpTaskPreprocSimple}}, \code{\link{PipeOpTaskPreproc}}, \code{\link{PipeOp}}, +\code{\link{mlr_pipeops_aggregate}}, \code{\link{mlr_pipeops_boxcox}}, \code{\link{mlr_pipeops_branch}}, \code{\link{mlr_pipeops_chunk}}, diff --git a/man/mlr_pipeops_threshold.Rd b/man/mlr_pipeops_threshold.Rd index 8aa23ccc0..e6129b226 100644 --- a/man/mlr_pipeops_threshold.Rd +++ b/man/mlr_pipeops_threshold.Rd @@ -76,6 +76,7 @@ Other PipeOps: \code{\link{PipeOpTaskPreprocSimple}}, \code{\link{PipeOpTaskPreproc}}, \code{\link{PipeOp}}, +\code{\link{mlr_pipeops_aggregate}}, \code{\link{mlr_pipeops_boxcox}}, \code{\link{mlr_pipeops_branch}}, \code{\link{mlr_pipeops_chunk}}, diff --git a/man/mlr_pipeops_tunethreshold.Rd b/man/mlr_pipeops_tunethreshold.Rd index 56947c7ef..f51cf126b 100644 --- a/man/mlr_pipeops_tunethreshold.Rd +++ b/man/mlr_pipeops_tunethreshold.Rd @@ -97,6 +97,7 @@ Other PipeOps: \code{\link{PipeOpTaskPreprocSimple}}, \code{\link{PipeOpTaskPreproc}}, \code{\link{PipeOp}}, +\code{\link{mlr_pipeops_aggregate}}, \code{\link{mlr_pipeops_boxcox}}, \code{\link{mlr_pipeops_branch}}, \code{\link{mlr_pipeops_chunk}}, diff --git a/man/mlr_pipeops_unbranch.Rd b/man/mlr_pipeops_unbranch.Rd index 
8cbb4dacc..2a0f63dc7 100644 --- a/man/mlr_pipeops_unbranch.Rd +++ b/man/mlr_pipeops_unbranch.Rd @@ -83,6 +83,7 @@ Other PipeOps: \code{\link{PipeOpTaskPreprocSimple}}, \code{\link{PipeOpTaskPreproc}}, \code{\link{PipeOp}}, +\code{\link{mlr_pipeops_aggregate}}, \code{\link{mlr_pipeops_boxcox}}, \code{\link{mlr_pipeops_branch}}, \code{\link{mlr_pipeops_chunk}}, diff --git a/man/mlr_pipeops_updatetarget.Rd b/man/mlr_pipeops_updatetarget.Rd index 245314651..29525b78f 100644 --- a/man/mlr_pipeops_updatetarget.Rd +++ b/man/mlr_pipeops_updatetarget.Rd @@ -96,6 +96,7 @@ Other PipeOps: \code{\link{PipeOpTaskPreprocSimple}}, \code{\link{PipeOpTaskPreproc}}, \code{\link{PipeOp}}, +\code{\link{mlr_pipeops_aggregate}}, \code{\link{mlr_pipeops_boxcox}}, \code{\link{mlr_pipeops_branch}}, \code{\link{mlr_pipeops_chunk}}, diff --git a/man/mlr_pipeops_vtreat.Rd b/man/mlr_pipeops_vtreat.Rd index d2747fbcb..e45abd615 100644 --- a/man/mlr_pipeops_vtreat.Rd +++ b/man/mlr_pipeops_vtreat.Rd @@ -149,6 +149,7 @@ Other PipeOps: \code{\link{PipeOpTaskPreprocSimple}}, \code{\link{PipeOpTaskPreproc}}, \code{\link{PipeOp}}, +\code{\link{mlr_pipeops_aggregate}}, \code{\link{mlr_pipeops_boxcox}}, \code{\link{mlr_pipeops_branch}}, \code{\link{mlr_pipeops_chunk}}, diff --git a/man/mlr_pipeops_yeojohnson.Rd b/man/mlr_pipeops_yeojohnson.Rd index 32eb7f47c..5dba9be0b 100644 --- a/man/mlr_pipeops_yeojohnson.Rd +++ b/man/mlr_pipeops_yeojohnson.Rd @@ -86,6 +86,7 @@ Other PipeOps: \code{\link{PipeOpTaskPreprocSimple}}, \code{\link{PipeOpTaskPreproc}}, \code{\link{PipeOp}}, +\code{\link{mlr_pipeops_aggregate}}, \code{\link{mlr_pipeops_boxcox}}, \code{\link{mlr_pipeops_branch}}, \code{\link{mlr_pipeops_chunk}}, diff --git a/tests/testthat/test_pipeop_aggregate.R b/tests/testthat/test_pipeop_aggregate.R new file mode 100644 index 000000000..da6bbcf68 --- /dev/null +++ b/tests/testthat/test_pipeop_aggregate.R @@ -0,0 +1,159 @@ +context("PipeOpAggregate") + +test_that("PipeOpAggregate - basic properties", { + op = PipeOpAggregate$new() + expect_pipeop(op) + + # generic tests + task = tsk("iris") + task$select(cols = "Petal.Length") + expect_datapreproc_pipeop_class(PipeOpAggregate, task = task) + + op$param_set$values$aggregation = list(NO_DEF = ~ mean(NO_DEF)) + expect_equal(task$data(), op$train(list(task))[[1L]]$data()) + + op$param_set$values$aggregation = list() + op$param_set$values$by = "NO_DEF" + expect_equal(task$data(), op$train(list(task))[[1L]]$data()) + + op$param_set$values$aggregation = list(NO_DEF = ~ mean(NO_DEF)) + expect_error(op$train(list(task)), regexp = "Must be equal to") + op$param_set$values$aggregation = list(Petal.Length = ~ mean(Petal.Length)) + expect_error(op$train(list(task)), regexp = "Must be element of") + + # toy aggregation works + calculate_mode = function(x) { + unique_x = unique(x) + unique_x[which.max(tabulate(match(x, unique_x)))] + } + task$cbind(data.table(row_reference = rep(1:3, each = 50L))) + task$cbind(data.table(categorical = as.factor(rep(c("a", "b", "c"), 50L)))) + task$set_col_roles("row_reference", roles = "row_reference") + op$param_set$values$aggregation = list(Petal.Length = ~ mean(Petal.Length), categorical = ~ calculate_mode(categorical)) + op$param_set$values$by = "row_reference" + train_out = op$train(list(task))[[1L]] + expect_data_table(train_out$data(), nrows = 3L, ncols = 3L) + expect_equal(train_out$data(cols = "Petal.Length")[["Petal.Length"]], + aggregate(Petal.Length ~ row_reference, FUN = mean, data = task$data(cols = c(task$feature_names, 
task$col_roles$row_reference)))[["Petal.Length"]]) + expect_equal(train_out$data(cols = "categorical")[["categorical"]], + aggregate(categorical ~ row_reference, FUN = calculate_mode, data = task$data(cols = c(task$feature_names, task$col_roles$row_reference)))[["categorical"]]) +}) + +test_that("PipeOpLearnerCV and PipeOpAggregate- different methods", { + skip_on_cran() # takes too long + + calculate_mode = function(x) { + unique_x = unique(x) + unique_x[which.max(tabulate(match(x, unique_x)))] + } + + # helper + test_valid_resampled_task = function(polrn, poagg, task, predict_type) { + polrn$learner$predict_type = predict_type + + lrn_out = polrn$train(list(task))[[1L]] + lrn_out_data = lrn_out$data() + if (class(polrn)[[1L]] %in% c("ResamplingCV", "ResamplingInsample", "ResamplingLoo")) { + expect_identical(lrn_out$row_ids, task$row_ids) + } else { + expect_subset(lrn_out$data(cols = lrn_out$col_roles$row_reference)[[lrn_out$col_roles$row_reference]], task$row_ids) + } + + agg_out = poagg$train(list(lrn_out))[[1L]] + if (class(polrn)[[1L]] %in% c("ResamplingCV", "ResamplingInsample", "ResamplingLoo", "ResamplingRepeatedCV")) { + expect_identical(agg_out$row_ids, task$row_ids) + } else { + expect_subset(agg_out$row_ids, task$row_ids) + } + + if (task$task_type == "classif") { + if (polrn$learner$predict_type == "response") { + feature = agg_out$data(cols = grep("*.response", agg_out$feature_names, value = TRUE))[[1L]] + expect_true(is.factor(feature)) + expect_identical(task$class_names, levels(feature)) + } else { # "prob" + features = agg_out$data(cols = grep("*.prob*", agg_out$feature_names, value = TRUE)) + sums = rowSums(is.na(features)) + expect_true(all(sums == 0 | sums == NCOL(features))) # either all or none missing + features = features[sums == 0, ] + expect_true(all(apply(features, MARGIN = 2L, function(x) x >= 0 & x <= 1))) # between 0 and 1 + expect_equal(rowSums(features), rep_len(1, length.out = NROW(features))) # sum is 1 + } + } else { # "regr" + if (polrn$learner$predict_type == "response") { + feature = agg_out$data(cols = grep("*.response", agg_out$feature_names, value = TRUE))[[1L]] + expect_true(is.numeric(feature)) + } else { # "se" + features = agg_out$data(cols = grep("*.response|*.se", agg_out$feature_names, value = TRUE)) + expect_true(all(apply(features, MARGIN = 2L, is.numeric))) + } + } + } + + set.seed(1234) + # faster training + taskc = tsk("german_credit")$filter(sample(1000, 50)) + taskc$select("age") + taskr = tsk("boston_housing")$filter(sample(sample(506, 50))) + taskr$select("rad") + + poaggcr = PipeOpAggregate$new( + param_vals = list(aggregation = list(classif.rpart.response = ~ calculate_mode(classif.rpart.response)), + by = "pre.classif.rpart")) + poaggcp = PipeOpAggregate$new( + param_vals = list(aggregation = list(classif.rpart.prob.bad = ~ mean(classif.rpart.prob.bad), classif.rpart.prob.good = ~ mean(classif.rpart.prob.good)), + by = "pre.classif.rpart")) + poaggrs = PipeOpAggregate$new( + param_vals = list(aggregation = list(regr.lm.response = ~ mean(regr.lm.response), regr.lm.se = ~ mean(regr.lm.se)), + by = "pre.regr.lm")) + + # cv + polrnc = PipeOpLearnerCV$new(LearnerClassifRpart$new(), rsmp("cv", folds = 2L)) + polrnr = PipeOpLearnerCV$new(mlr3learners::LearnerRegrLM$new(), rsmp("cv", folds = 2L)) + test_valid_resampled_task(polrnc, poaggcr, taskc, "response") + test_valid_resampled_task(polrnc, poaggcp, taskc, "prob") + test_valid_resampled_task(polrnr, poaggrs, taskr, "se") + + # insample + polrnc = 
PipeOpLearnerCV$new(LearnerClassifRpart$new(), rsmp("insample")) + polrnr = PipeOpLearnerCV$new(mlr3learners::LearnerRegrLM$new(), rsmp("insample")) + test_valid_resampled_task(polrnc, poaggcr, taskc, "response") + test_valid_resampled_task(polrnc, poaggcp, taskc, "prob") + test_valid_resampled_task(polrnr, poaggrs, taskr, "se") + + # bootstrap + polrnc = PipeOpLearnerCV$new(LearnerClassifRpart$new(), rsmp("bootstrap", repeats = 2L)) + polrnr = PipeOpLearnerCV$new(mlr3learners::LearnerRegrLM$new(), rsmp("bootstrap", repeats = 2L)) + test_valid_resampled_task(polrnc, poaggcr, taskc, "response") + test_valid_resampled_task(polrnc, poaggcp, taskc, "prob") + test_valid_resampled_task(polrnr, poaggrs, taskr, "se") + + # holdout + polrnc = PipeOpLearnerCV$new(LearnerClassifRpart$new(), rsmp("holdout")) + polrnr = PipeOpLearnerCV$new(mlr3learners::LearnerRegrLM$new(), rsmp("holdout")) + test_valid_resampled_task(polrnc, poaggcr, taskc, "response") + test_valid_resampled_task(polrnc, poaggcp, taskc, "prob") + test_valid_resampled_task(polrnr, poaggrs, taskr, "se") + + # loo + polrnc = PipeOpLearnerCV$new(LearnerClassifRpart$new(), rsmp("loo")) + polrnr = PipeOpLearnerCV$new(mlr3learners::LearnerRegrLM$new(), rsmp("loo")) + test_valid_resampled_task(polrnc, poaggcr, taskc, "response") + test_valid_resampled_task(polrnc, poaggcp, taskc, "prob") + test_valid_resampled_task(polrnr, poaggrs, taskr, "se") + + # repeated_cv + polrnc = PipeOpLearnerCV$new(LearnerClassifRpart$new(), rsmp("repeated_cv", folds = 2L, repeats = 2L)) + polrnr = PipeOpLearnerCV$new(mlr3learners::LearnerRegrLM$new(), rsmp("repeated_cv", folds = 2L, repeats = 2L)) + test_valid_resampled_task(polrnc, poaggcr, taskc, "response") + test_valid_resampled_task(polrnc, poaggcp, taskc, "prob") + test_valid_resampled_task(polrnr, poaggrs, taskr, "se") + + # subsampling + polrnc = PipeOpLearnerCV$new(LearnerClassifRpart$new(), rsmp("subsampling", repeats = 2L)) + polrnr = PipeOpLearnerCV$new(mlr3learners::LearnerRegrLM$new(), rsmp("subsampling", repeats = 2L)) + test_valid_resampled_task(polrnc, poaggcr, taskc, "response") + test_valid_resampled_task(polrnc, poaggcp, taskc, "prob") + test_valid_resampled_task(polrnr, poaggrs, taskr, "se") +}) + diff --git a/tests/testthat/test_pipeop_colroles.R b/tests/testthat/test_pipeop_colroles.R index 0257f8b8d..adab8a9cf 100644 --- a/tests/testthat/test_pipeop_colroles.R +++ b/tests/testthat/test_pipeop_colroles.R @@ -34,7 +34,7 @@ test_that("PipeOpColRoles - functionality works", { train_out = train_pipeop(op, inputs = list(task))$output expect_equal(train_out$col_roles, list(feature = c("Sepal.Length", "Sepal.Width"), target = "Species", name = "Petal.Length", - order = "Petal.Length", stratum = character(), group = character(), weight = character(), uri = character(0) + order = "Petal.Length", stratum = character(0L), group = character(0L), weight = character(0L), uri = character(0L), row_reference = character(0L) ) ) expect_equal(train_out$row_names$row_name, task$data(cols = "Petal.Length")[[1L]]) diff --git a/tests/testthat/test_pipeop_learnercv.R b/tests/testthat/test_pipeop_learnercv.R index 67510d8ee..1112f3e13 100644 --- a/tests/testthat/test_pipeop_learnercv.R +++ b/tests/testthat/test_pipeop_learnercv.R @@ -4,18 +4,18 @@ test_that("PipeOpLearnerCV - basic properties", { lrn = mlr_learners$get("classif.featureless") po = PipeOpLearnerCV$new(lrn) expect_pipeop(po$clone(), check_ps_default_values = FALSE) - expect_data_table(po$input, nrows = 1) - expect_data_table(po$output, nrows = 1) + 
expect_data_table(po$input, nrows = 1L) + expect_data_table(po$output, nrows = 1L) task = mlr_tasks$get("iris") - tsk = train_pipeop(po, list(task = task))[[1]] + tsk = train_pipeop(po, list(task = task))[[1L]] expect_class(tsk, "Task") expect_true(tsk$nrow == 150L) expect_true(tsk$ncol == 2L) expect_equal(task$target_names, tsk$target_names) expect_equal(task$class_names, tsk$class_names) vals = factor(unique(tsk$data(cols = tsk$feature_names)$response)) - expect_character(setdiff(vals, task$class_names), len = 0) + expect_character(setdiff(vals, task$class_names), len = 0L) tsk = predict_pipeop(po, list(task = task))[[1]] expect_class(tsk, "Task") @@ -24,35 +24,34 @@ test_that("PipeOpLearnerCV - basic properties", { expect_equal(task$target_names, tsk$target_names) expect_equal(task$class_names, tsk$class_names) vals = factor(unique(tsk$data(cols = tsk$feature_names)$response)) - expect_character(setdiff(vals, task$class_names), len = 0) + expect_character(setdiff(vals, task$class_names), len = 0L) lrn = mlr_learners$get("classif.featureless") iris_with_unambiguous_mode = mlr_tasks$get("iris")$filter(c(1:49, 52:150)) # want featureless learner without randomness expect_datapreproc_pipeop_class(PipeOpLearnerCV, - list(lrn), iris_with_unambiguous_mode, predict_like_train = FALSE, deterministic_train = FALSE, check_ps_default_values = FALSE) + list(lrn), iris_with_unambiguous_mode, predict_like_train = FALSE, deterministic_train = FALSE, affect_context_independent = FALSE, check_ps_default_values = FALSE) # 'insample' PipeOpLearnerCV with deterministic Learner is deterministic in every regard! expect_datapreproc_pipeop_class(PipeOpLearnerCV, - list(lrn, resampling = rsmp("insample")), iris_with_unambiguous_mode, check_ps_default_values = FALSE) + list(lrn, resampling = rsmp("insample")), iris_with_unambiguous_mode, affect_context_independent = FALSE, check_ps_default_values = FALSE) expect_error(PipeOpLearnerCV$new()) - }) test_that("PipeOpLearnerCV - param values", { lrn = mlr_learners$get("classif.rpart") polrn = PipeOpLearnerCV$new(lrn) expect_subset(c("minsplit", "resampling.folds", "keep_response"), names(polrn$param_set$params)) - expect_equal(polrn$param_set$values, list(resampling.folds = 3, keep_response = FALSE, xval = 0)) - polrn$param_set$values$minsplit = 2 - expect_equal(polrn$param_set$values, list(resampling.folds = 3, keep_response = FALSE, minsplit = 2, xval = 0)) - polrn$param_set$values$resampling.folds = 4 - expect_equal(polrn$param_set$values, list(resampling.folds = 4, keep_response = FALSE, minsplit = 2, xval = 0)) + expect_equal(polrn$param_set$values, list(resampling.folds = 3L, keep_response = FALSE, xval = 0)) + polrn$param_set$values$minsplit = 2L + expect_equal(polrn$param_set$values, list(resampling.folds = 3L, keep_response = FALSE, minsplit = 2L, xval = 0)) + polrn$param_set$values$resampling.folds = 4L + expect_equal(polrn$param_set$values, list(resampling.folds = 4L, keep_response = FALSE, minsplit = 2L, xval = 0)) }) test_that("PipeOpLearnerCV - within resampling", { lrn = mlr_learners$get("classif.rpart") gr = GraphLearner$new(PipeOpLearnerCV$new(lrn) %>>% po(id = "l2", lrn)) - resample(tsk("iris"), gr, rsmp("holdout")) + expect_r6(resample(tsk("iris"), gr, rsmp("holdout")), classes = "ResampleResult") }) test_that("PipeOpLearnerCV - insample resampling", { @@ -60,14 +59,14 @@ test_that("PipeOpLearnerCV - insample resampling", { iris_with_unambiguous_mode = mlr_tasks$get("iris")$filter(c(1:49, 52:150)) # want featureless learner without randomness polrn 
= PipeOpLearnerCV$new(lrn, rsmp("insample")) - expect_equal(polrn$train(list(iris_with_unambiguous_mode))[[1]]$data(), + expect_equal(polrn$train(list(iris_with_unambiguous_mode))[[1L]]$data(), cbind(iris_with_unambiguous_mode$data(cols = "Species"), classif.featureless.response = factor("virginica", levels = levels(iris[[5]])))) lrn = mlr_learners$get("classif.rpart") polrn = PipeOpLearnerCV$new(lrn, rsmp("insample")) - expect_equal(polrn$train(list(iris_with_unambiguous_mode))[[1]], - polrn$predict(list(iris_with_unambiguous_mode))[[1]]) + expect_equal(polrn$train(list(iris_with_unambiguous_mode))[[1L]], + polrn$predict(list(iris_with_unambiguous_mode))[[1L]]) }) test_that("PipeOpLearnerCV - graph but no id", { @@ -99,140 +98,3 @@ test_that("PipeOpLearnerCV - model active binding to state", { expect_equal(po$learner_model$state, po$state) }) -test_that("PipeOpLearnerCV - different methods", { - skip_on_cran() # takes too long - - # Helper - test_valid_resampled_task = function(polrn, task, predict_type) { - polrn$learner$predict_type = predict_type - - train_out = polrn$train(list(task))[[1]] - train_out_data = train_out$data() - expect_identical(task$row_ids, train_out$row_ids) - - if (task$task_type == "classif") { - if (polrn$learner$predict_type == "response") { - feature = train_out$data(cols = grep("*.response", train_out$feature_names, value = TRUE))[[1L]] - expect_true(is.factor(feature)) - expect_identical(task$class_names, levels(feature)) - } else { # "prob" - features = train_out$data(cols = grep("*.prob*", train_out$feature_names, value = TRUE)) - sums = rowSums(is.na(features)) - expect_true(all(sums == 0 | sums == NCOL(features))) # either all or none missing - features = features[sums == 0, ] - expect_true(all(apply(features, MARGIN = 2L, function(x) x >= 0 & x <= 1))) # between 0 and 1 - expect_equal(rowSums(features), rep_len(1, length.out = NROW(features))) # sum is 1 - } - } else { # "regr" - if (polrn$learner$predict_type == "response") { - feature = train_out$data(cols = grep("*.response", train_out$feature_names, value = TRUE))[[1L]] - expect_true(is.numeric(feature)) - } else { # "se" - features = train_out$data(cols = grep("*.response|*.se", train_out$feature_names, value = TRUE)) - expect_true(all(apply(features, MARGIN = 2L, is.numeric))) - } - } - } - - set.seed(1234) - # faster training - taskc = tsk("german_credit")$filter(sample(1000, 50)) - taskc$select("age") - taskr = tsk("boston_housing")$filter(sample(sample(506, 50))) - taskr$select("rad") - - # cv - polrnc = PipeOpLearnerCV$new(LearnerClassifRpart$new(), rsmp("cv", folds = 2)) - polrnr = PipeOpLearnerCV$new(mlr3learners::LearnerRegrLM$new(), rsmp("cv", folds = 2)) - test_valid_resampled_task(polrnc, taskc, "response") - test_valid_resampled_task(polrnc, taskc, "prob") - test_valid_resampled_task(polrnr, taskr, "se") - - # bootstrap - polrnc = PipeOpLearnerCV$new(LearnerClassifRpart$new(), rsmp("bootstrap", repeats = 2)) - polrnr = PipeOpLearnerCV$new(mlr3learners::LearnerRegrLM$new(), rsmp("bootstrap", repeats = 2)) - test_valid_resampled_task(polrnc, taskc, "response") - test_valid_resampled_task(polrnc, taskc, "prob") - test_valid_resampled_task(polrnr, taskr, "se") - - # holdout - polrnc = PipeOpLearnerCV$new(LearnerClassifRpart$new(), rsmp("holdout")) - polrnr = PipeOpLearnerCV$new(mlr3learners::LearnerRegrLM$new(), rsmp("holdout")) - test_valid_resampled_task(polrnc, taskc, "response") - test_valid_resampled_task(polrnc, taskc, "prob") - test_valid_resampled_task(polrnr, taskr, "se") - - # loo - 
polrnc = PipeOpLearnerCV$new(LearnerClassifRpart$new(), rsmp("loo")) - polrnr = PipeOpLearnerCV$new(mlr3learners::LearnerRegrLM$new(), rsmp("loo")) - test_valid_resampled_task(polrnc, taskc, "response") - test_valid_resampled_task(polrnc, taskc, "prob") - test_valid_resampled_task(polrnr, taskr, "se") - - # repeated_cv - polrnc = PipeOpLearnerCV$new(LearnerClassifRpart$new(), rsmp("repeated_cv", folds = 2, repeats = 2)) - polrnr = PipeOpLearnerCV$new(mlr3learners::LearnerRegrLM$new(), rsmp("repeated_cv", folds = 2, repeats = 2)) - test_valid_resampled_task(polrnc, taskc, "response") - test_valid_resampled_task(polrnc, taskc, "prob") - test_valid_resampled_task(polrnr, taskr, "se") - - # subsampling - polrnc = PipeOpLearnerCV$new(LearnerClassifRpart$new(), rsmp("subsampling", repeats = 2)) - polrnr = PipeOpLearnerCV$new(mlr3learners::LearnerRegrLM$new(), rsmp("subsampling", repeats = 2)) - test_valid_resampled_task(polrnc, taskc, "response") - test_valid_resampled_task(polrnc, taskc, "prob") - test_valid_resampled_task(polrnr, taskr, "se") - - # custom - # classif - rcm = rsmp("custom") - rcm$instantiate(taskc, train_sets = list(taskc$row_ids[1:25], taskc$row_ids[26:50]), test_sets = list(taskc$row_ids[1:25], taskc$row_ids[26:50])) # no multiples no missings - polrnc = PipeOpLearnerCV$new(LearnerClassifRpart$new(), rcm) - test_valid_resampled_task(polrnc, taskc, "response") - test_valid_resampled_task(polrnc, taskc, "prob") - - rcm$instantiate(taskc, train_sets = list(taskc$row_ids[1:25], taskc$row_ids[26:50]), test_sets = list(taskc$row_ids[1:25], taskc$row_ids[1:50])) # multiples but no missings - test_valid_resampled_task(polrnc, taskc, "response") - test_valid_resampled_task(polrnc, taskc, "prob") - - rcm$instantiate(taskc, train_sets = list(taskc$row_ids[1:25], taskc$row_ids[26:50]), test_sets = list(taskc$row_ids[1:25], taskc$row_ids[26:45])) # no multiples but missings - test_valid_resampled_task(polrnc, taskc, "response") - test_valid_resampled_task(polrnc, taskc, "prob") - polrnc$learner$predict_type = "response" - feature_out = polrnc$train(list(taskc))[[1L]]$data(cols = "classif.rpart.response")[[1L]] - expect_true(all(which(is.na(feature_out)) == 46:50)) - polrnc$learner$predict_type = "prob" - features_out = polrnc$train(list(taskc))[[1L]]$data(cols = c("classif.rpart.prob.good", "classif.rpart.prob.bad")) - expect_true(all(which(rowSums(is.na(features_out)) == 2L) == 46:50)) - - rcm$instantiate(taskc, train_sets = list(taskc$row_ids[1:25], taskc$row_ids[26:50]), test_sets = list(taskc$row_ids[1:25], taskc$row_ids[20:45])) # multiples and missings - test_valid_resampled_task(polrnc, taskc, "response") - test_valid_resampled_task(polrnc, taskc, "prob") - polrnc$learner$predict_type = "response" - feature_out = polrnc$train(list(taskc))[[1L]]$data(cols = "classif.rpart.response")[[1L]] - expect_true(all(which(is.na(feature_out)) == 46:50)) - polrnc$learner$predict_type = "prob" - features_out = polrnc$train(list(taskc))[[1L]]$data(cols = c("classif.rpart.prob.good", "classif.rpart.prob.bad")) - expect_true(all(which(rowSums(is.na(features_out)) == 2L) == 46:50)) - - # regr - rcm = rsmp("custom") - rcm$instantiate(taskr, train_sets = list(taskr$row_ids[1:25], taskr$row_ids[26:50]), test_sets = list(taskr$row_ids[1:25], taskr$row_ids[26:50])) # no multiples no missings - polrnr = PipeOpLearnerCV$new(mlr3learners::LearnerRegrLM$new(), rcm) - test_valid_resampled_task(polrnr, taskr, "se") - - rcm$instantiate(taskr, train_sets = list(taskr$row_ids[1:25], taskr$row_ids[26:50]), 
test_sets = list(taskr$row_ids[1:25], taskr$row_ids[1:50])) # multiples but no missings - test_valid_resampled_task(polrnr, taskr, "se") - - rcm$instantiate(taskr, train_sets = list(taskr$row_ids[1:25], taskr$row_ids[26:50]), test_sets = list(taskr$row_ids[1:25], taskr$row_ids[26:45])) # no multiples but missings - test_valid_resampled_task(polrnr, taskr, "se") - polrnr$learner$predict_type = "se" - features_out = polrnr$train(list(taskr))[[1L]]$data(cols = c("regr.lm.response", "regr.lm.se")) - expect_true(all(which(rowSums(is.na(features_out)) == 2L) == 46:50)) - - rcm$instantiate(taskr, train_sets = list(taskr$row_ids[1:25], taskr$row_ids[26:50]), test_sets = list(taskr$row_ids[1:25], taskr$row_ids[20:45])) # multiples and missings - test_valid_resampled_task(polrnr, taskr, "se") - polrnr$learner$predict_type = "se" - features_out = polrnr$train(list(taskr))[[1L]]$data(cols = c("regr.lm.response", "regr.lm.se")) - expect_true(all(which(rowSums(is.na(features_out)) == 2L) == 46:50)) -}) From 6431bd95523295567eefbb159db0058c1775e8e2 Mon Sep 17 00:00:00 2001 From: sumny Date: Thu, 11 Mar 2021 15:16:47 +0100 Subject: [PATCH 8/8] .. --- R/PipeOpAggregate.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/PipeOpAggregate.R b/R/PipeOpAggregate.R index 0f96a10da..e57b057f7 100644 --- a/R/PipeOpAggregate.R +++ b/R/PipeOpAggregate.R @@ -134,11 +134,11 @@ mlr_pipeops$add("aggregate", PipeOpAggregate) # @param x [list] whatever `aggregation` is being set to # checks that `aggregation` is # * a named list of `formula` -# * that each element has only a lhs +# * that each element has only a rhs check_aggregation_formulae = function(x) { check_list(x, types = "formula", names = "unique") %check&&% Reduce(`%check&&%`, lapply(x, function(xel) { - if (length(xel) != 2) { + if (length(xel) != 2L) { return(sprintf("formula %s must not have a left hand side.", deparse(xel, nlines = 1L, width.cutoff = 500L))) }
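# Illustration only (not part of the patch): given the documented contract above,
# the check accepts a uniquely named list of right-hand-side-only formulas and
# flags any formula that carries a left-hand side (assuming the remainder of the
# function, cut off here, adds no further constraints), roughly:
#
#   check_aggregation_formulae(list(Petal.Length = ~ mean(Petal.Length)))
#   # expected: TRUE
#   check_aggregation_formulae(list(Petal.Length = Petal.Length ~ mean(Petal.Length)))
#   # expected: a message that the formula "must not have a left hand side."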