From 15c0fa9549a67b2372f7664bd2358943533ffc0e Mon Sep 17 00:00:00 2001 From: Sebastian Fischer Date: Tue, 3 Dec 2024 09:17:29 +0100 Subject: [PATCH 01/46] ... --- paper/paper.bib | 18 ++++++++++++ paper/paper.md | 76 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 94 insertions(+) create mode 100644 paper/paper.bib create mode 100644 paper/paper.md diff --git a/paper/paper.bib b/paper/paper.bib new file mode 100644 index 000000000..43ff01a25 --- /dev/null +++ b/paper/paper.bib @@ -0,0 +1,18 @@ +@Manual{R, + title = {R: A Language and Environment for Statistical Computing}, + author = {{R Core Team}}, + organization = {R Foundation for Statistical Computing}, + address = {Vienna, Austria}, + year = {2019}, + url = {https://www.R-project.org/}, +} + +@article{lang2019mlr3, + title={mlr3: A modern object-oriented machine learning framework in R}, + author={Lang, Michel and Binder, Martin and Richter, Jakob and Schratz, Patrick and Pfisterer, Florian and Coors, Stefan and Au, Quay and Casalicchio, Giuseppe and Kotthoff, Lars and Bischl, Bernd}, + journal={Journal of Open Source Software}, + volume={4}, + number={44}, + pages={1903}, + year={2019} +} diff --git a/paper/paper.md b/paper/paper.md new file mode 100644 index 000000000..690b6bb18 --- /dev/null +++ b/paper/paper.md @@ -0,0 +1,76 @@ +--- +title: 'mlr3extralearners: A community-driven package for integrating learners into mlr3' +tags: + - R + - machine learning + - community +authors: + - name: Sebastian Fischer + orcid: 0000-0002-9609-3197 + affiliation: "2, 3" + - name: Michel Lang + orcid: 0000-0001-9754-0393 + affiliation: "1, 2" + - name: Martin Binder + affiliation: 2 + - name: Patrick Schratz + orcid: 0000-0003-0748-6624 + affiliation: 2 + - name: Bernd Bischl + orcid: 0000-0001-6002-6980 + affiliation: "2, 3" +affiliations: + - name: TU Dortmund University + index: 1 + - name: LMU Munich + index: 2 + - name: Munich Center for Machine Learning + index: 3 +date: XXX December 2024 
+bibliography: paper.bib +--- + +# Summary + +- Background on mlr3: + - unified interface for ML in R + - mention that we don't implement methods ourselves but just wrap them + - mlr3 is nothing without its learners + - mention other packgages such as mlr3learners, mlr3torch and mlr3proba + +- Comparison with other packages: + - parsnip tidymodels + - ??? + +- License of the package + +# Statement of Need + +- No ML without learners: + - people have to rewrite the same learners +- Give people the ability to contribute their own methods to the ecosystem + +# Features + +- Ease of use + - benefit from the whole ecosystem: + - annotate parameter spaces that make tuning the learner easier + - Examples for how to user learner + - preprocessing via mlr3pipelines + - tuning via mlr3tuning + +- Functional correctness + - parameter tests + - sanity tests + +- Community-driven integration of new learners: + - distinguish between mlr3learners and mlr3extralearners + - Mention tutorial on website + - Mention template-generating functions for tests and learner + +# Acknowledgements + +Sebastian Fischer is supported by the Deutsche Forschungsgemeinschaft (DFG, German Research +Foundation) – 460135501 (NFDI project MaRDI). 
+ +# References From 52cdd101c1494b0d505118f463c9dde49a7ca951 Mon Sep 17 00:00:00 2001 From: Sebastian Fischer Date: Tue, 3 Dec 2024 11:31:00 +0100 Subject: [PATCH 02/46] add first draft --- paper/paper.bib | 109 ++++++++++++++++++++++++++++++++++++++++++++---- paper/paper.md | 68 +++++++++++++++++++----------- 2 files changed, 144 insertions(+), 33 deletions(-) diff --git a/paper/paper.bib b/paper/paper.bib index 43ff01a25..4acc61d57 100644 --- a/paper/paper.bib +++ b/paper/paper.bib @@ -1,3 +1,12 @@ +@book{Bischl2024 + title = {Applied Machine Learning Using {m}lr3 in {R}}, + editor = {Bernd Bischl and Raphael Sonabend and Lars Kotthoff and Michel Lang}, + url = {https://mlr3book.mlr-org.com}, + year = {2024}, + isbn = {9781032507545}, + publisher = {CRC Press} +} + @Manual{R, title = {R: A Language and Environment for Statistical Computing}, author = {{R Core Team}}, @@ -7,12 +16,96 @@ @Manual{R url = {https://www.R-project.org/}, } -@article{lang2019mlr3, - title={mlr3: A modern object-oriented machine learning framework in R}, - author={Lang, Michel and Binder, Martin and Richter, Jakob and Schratz, Patrick and Pfisterer, Florian and Coors, Stefan and Au, Quay and Casalicchio, Giuseppe and Kotthoff, Lars and Bischl, Bernd}, - journal={Journal of Open Source Software}, - volume={4}, - number={44}, - pages={1903}, - year={2019} +@article{mlr, + author = {Bernd Bischl and Michel Lang and Lars Kotthoff and Julia Schiffner and Jakob Richter and Erich Studerus and Giuseppe Casalicchio and Zachary M. 
Jones}, + title = {mlr: Machine Learning in R}, + journal = {Journal of Machine Learning Research}, + year = {2016}, + volume = {17}, + number = {170}, + pages = {1-5}, + url = {http://jmlr.org/papers/v17/15-066.html} +} + +@Manual{data.table, + title = {data.table: Extension of `data.frame`}, + author = {Matt Dowle and Arun Srinivasan}, + year = {2019}, + note = {R package version 1.12.6}, + url = {https://CRAN.R-project.org/package=data.table}, +} + +@article{sk-learn, + title={Scikit-learn: Machine Learning in {P}ython}, + author={Pedregosa, F. and Varoquaux, G. and Gramfort, A. and Michel, V. + and Thirion, B. and Grisel, O. and Blondel, M. and Prettenhofer, P. + and Weiss, R. and Dubourg, V. and Vanderplas, J. and Passos, A. and + Cournapeau, D. and Brucher, M. and Perrot, M. and Duchesnay, E.}, + journal={Journal of Machine Learning Research}, + volume={12}, + pages={2825--2830}, + year={2011}, + url = {http://jmlr.org/papers/v12/pedregosa11a.html} +} + +@article{caret, + author = {Max Kuhn}, + title = {Building Predictive Models in R Using the caret Package}, + journal = {Journal of Statistical Software, Articles}, + volume = {28}, + number = {5}, + year = {2008}, + keywords = {}, + pages = {1--26}, + doi = {10.18637/jss.v028.i05}, + url = {https://www.jstatsoft.org/v028/i05} } + +@article{weka, + title = {{The WEKA Data Mining Software: An Update}}, + author = {Hall, M. and Frank, E. and Holmes, G. and Pfahringer, B. and Reutemann, P. and Witten, I. H. 
}, + journal = {ACM SIGKDD explorations newsletter}, + volume = {11}, + number = {1}, + pages = {10--18}, + year = {2009}, + publisher = {ACM}, + doi = {10.1145/1656274.1656278} +} + +@Manual{tidymodels, + title = {tidymodels: Easily Install and Load the 'Tidymodels' Packages}, + author = {Max Kuhn and Hadley Wickham}, + year = {2019}, + note = {R package version 0.0.3}, + url = {https://CRAN.R-project.org/package=tidymodels}, +} + +@software{mlj, + author = {Anthony Blaom and Franz Kiraly and Thibaut Lienart and Sebastian Vollmer}, + title = {alan-turing-institute/MLJ.jl: v0.5.3}, + month = nov, + year = 2019, + publisher = {Zenodo}, + version = {v0.5.3}, + doi = {10.5281/zenodo.3541506}, + url = {https://doi.org/10.5281/zenodo.3541506} +} + +@article{checkmate, + author = {Michel Lang}, + title = {{checkmate: Fast Argument Checks for Defensive R Programming}}, + year = {2017}, + journal = {{The R Journal}}, + doi = {10.32614/RJ-2017-028}, + pages = {437--445}, + volume = {9}, + number = {1} +} + + @Manual{paradox, + title = {paradox: Define and Work with Parameter Spaces for Complex Algorithms}, + author = {Michel Lang and Bernd Bischl and Jakob Richter and Xudong Sun and Martin Binder}, + year = {2024}, + note = {R package version 1.0.1}, + url = {https://CRAN.R-project.org/package=paradox}, diff --git a/paper/paper.md b/paper/paper.md index 690b6bb18..e8239864a 100644 --- a/paper/paper.md +++ b/paper/paper.md @@ -4,6 +4,7 @@ tags: - R - machine learning - community + - FAIR authors: - name: Sebastian Fischer orcid: 0000-0002-9609-3197 @@ -32,41 +33,58 @@ bibliography: paper.bib # Summary -- Background on mlr3: - - unified interface for ML in R - - mention that we don't implement methods ourselves but just wrap them - - mlr3 is nothing without its learners - - mention other packgages such as mlr3learners, mlr3torch and mlr3proba +The [`mlr3extralearners`](https://mlr3extralearners.mlr-org.com/) [`R`](https://www.r-project.org/) [@R] package is a community-driven 
package that integrates external machine learning algorithms into the [`mlr3`](https://mlr3.mlr-org.com/) [@mlr3] ecosystem. +The `mlr3` ecosystem is a versatile toolbox for machine learning in `R` and is targeted towards both practitioners and researchers [@Bischl2024]. +At its core, the package provides a standardized interface for machine learning and connects many R packages implementing machine learning algorithms into a unified framework. +The `mlr3extralearners` package currently wraps 85 different learning algorithms from many different R packages, making these methods immediately accessible to `mlr3` users. +An overview of all `mlr3` learners, including those from `mlr3extralearners`, is given in the [mlr3 website](https://mlr3learners.mlr-org.com/). +Furthermore, the package also allows `mlr3` users and package developers to easily add their own learners to the ecosystem. +In addition to making these learners available to `mlr3` users, integrating learners into `mlr3extralearners` also annotates them with extensive metadata about their parameter space, predict types and other capabilities. +Furthermore, `mlr3extralearners` verifies the correctness of learners by regularly running sanity checks on the learner, as well as verifying that the parameter space is up to date with the latest version of the package implementing the algorithm. +In order to allow the package to also include learners that are not available on `CRAN`, the package is hosted on the [`mlr` R-universe](https://mlr-org.r-universe.dev/). - Comparison with other packages: - parsnip tidymodels - ??? 
-- License of the package - # Statement of Need -- No ML without learners: - - people have to rewrite the same learners -- Give people the ability to contribute their own methods to the ecosystem +In order to solve modeling problems using machine learning, one often has specific requirements for the learning algorithm such as performance, interpretability, or the ability to handle specific data types. +For this reason, it is essential for the `mlr3` ecosystem to offer a wide variety of learners, such that users can choose the most appropriate learner for their specific problem. +While connecting a new learner to `mlr3` is straightforward and can be done on a per-need basis, integrating learners into `mlr3extralearners` also makes this available to other users and avoids replication of effort. +Furthermore, contributing to `mlr3extralearners` also has the added benefits that the learners are reviewed by the maintainers of the package, ensuring that they are correct and work as expected. + +Besides the advantage for users of machine learning methods, `mlr3extralearners` also offers benefits for package developers. +After developing a new R package that implements a machine learning algorithm, making it available in the `mlr3` ecosystem means that the learning algorithm is immediately integrated into the wider ecosystem and can therefore easily be tuned or combined with preprocessing steps. 
# Features -- Ease of use - - benefit from the whole ecosystem: - - annotate parameter spaces that make tuning the learner easier - - Examples for how to user learner - - preprocessing via mlr3pipelines - - tuning via mlr3tuning - -- Functional correctness - - parameter tests - - sanity tests - -- Community-driven integration of new learners: - - distinguish between mlr3learners and mlr3extralearners - - Mention tutorial on website - - Mention template-generating functions for tests and learner +The core functionality of `mlr3extralearners` is to integrate new learners into the `mlr3` ecosystem. +By doing so, many different learning algorithms can be used with the same syntax and standardized interface. +However, the benefits of `mlr3extralearners` do not stop at mere integration. + +## Metadata + +One core feature of the `mlr3` ecosystem is that it annotates learners with extensive metadata. +For one, the parameter spaces of learners are defined as parameter sets as defined in the [`paradox` package](https://paradox.mlr-org.com/) [@paradox]. +Parameters are explicitly typed, their ranges or list of available values are annotated and this information is used to both check for valid configurations, but also allow for easier tuning of the hyperparameters. +Furthermore, learners are annotated with respect to their task type (such as classification, regression or survival analysis) and predict type (such as probabilities or class predictions), which feature types they can handle, and which capabilities they have. +The latter are standardized via a set of standardized properties, which e.g. includes the ability to do feature selection, to assign importance scores to features, or to handle missing values. + +## Functional Correctness + +One problem that comes with wrapping learning algorithms from different R packages is that their API can change. 
+The most frequent case is that new parameters are added, which were not present in the version of the package when the learner was wrapped. +In `mlr3extralearners`, we regularly check whether the learner implements the expected interface of the upstream function and update the parameter set accordingly. + +In addition to this `mlr3`-specific check, `mlr3extralearners` also verifies the correctness of learners by regularly running automatic tests on the learners. +These tests perform simple sanity checks and also verify that the learner's metadata is correctly annotated, e.g. that a learner that claims to be able to handle missing values actually does so. + +## Templates for new Learners + +In order to make the integration of new learners into `mlr3extralearners` as easy as possible, we provide templates for generating code for both the learner itself, as well as associated test files. +These templates can easily be created via an `R` function that takes in the metadata of the learner and generates files that fill out as much as possible and clearly indicate what is needed to be added by the user. +The package website contains an [extensive tutorioal](https://mlr3extralearners.mlr-org.com/articles/extending.html) on how to do this, as well as a list with [common mistakes](https://mlr3extralearners.mlr-org.com/articles/common_issues.html). 
# Acknowledgements From 434296cb2a8451921ad83861d964dd9daeeedc05 Mon Sep 17 00:00:00 2001 From: john Date: Mon, 9 Dec 2024 10:48:02 +0100 Subject: [PATCH 03/46] add paper workflow --- .github/workflows/draft-pdf.yml | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) create mode 100644 .github/workflows/draft-pdf.yml diff --git a/.github/workflows/draft-pdf.yml b/.github/workflows/draft-pdf.yml new file mode 100644 index 000000000..2f7d2ed72 --- /dev/null +++ b/.github/workflows/draft-pdf.yml @@ -0,0 +1,28 @@ +name: Draft PDF +on: + push: + paths: + - paper/** + - .github/workflows/draft-pdf.yml + +jobs: + paper: + runs-on: ubuntu-latest + name: Paper Draft + steps: + - name: Checkout + uses: actions/checkout@v4 + - name: Build draft PDF + uses: openjournals/openjournals-draft-action@master + with: + journal: joss + # This should be the path to the paper within your repo. + paper-path: paper.md + - name: Upload + uses: actions/upload-artifact@v4 + with: + name: paper + # This is the output path where Pandoc will write the compiled + # PDF. Note, this should be the same directory as the input + # paper.md + path: paper.pdf From 916adb2c47a3af2f35a38698c1cf1f1f8ce4e24b Mon Sep 17 00:00:00 2001 From: john Date: Mon, 9 Dec 2024 10:53:42 +0100 Subject: [PATCH 04/46] correct path --- .github/workflows/draft-pdf.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/draft-pdf.yml b/.github/workflows/draft-pdf.yml index 2f7d2ed72..1dd444eed 100644 --- a/.github/workflows/draft-pdf.yml +++ b/.github/workflows/draft-pdf.yml @@ -17,7 +17,7 @@ jobs: with: journal: joss # This should be the path to the paper within your repo. - paper-path: paper.md + paper-path: paper/paper.md - name: Upload uses: actions/upload-artifact@v4 with: @@ -25,4 +25,4 @@ jobs: # This is the output path where Pandoc will write the compiled # PDF. 
Note, this should be the same directory as the input # paper.md - path: paper.pdf + path: paper/paper.pdf From a95551380907b70fd9cc67edbb337c7c30cf5a68 Mon Sep 17 00:00:00 2001 From: john Date: Mon, 9 Dec 2024 11:03:50 +0100 Subject: [PATCH 05/46] fix bib file --- paper/paper.bib | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/paper/paper.bib b/paper/paper.bib index 4acc61d57..5d7fe443e 100644 --- a/paper/paper.bib +++ b/paper/paper.bib @@ -1,4 +1,4 @@ -@book{Bischl2024 +@book{Bischl2024, title = {Applied Machine Learning Using {m}lr3 in {R}}, editor = {Bernd Bischl and Raphael Sonabend and Lars Kotthoff and Michel Lang}, url = {https://mlr3book.mlr-org.com}, @@ -103,9 +103,10 @@ @article{checkmate number = {1} } - @Manual{paradox, - title = {paradox: Define and Work with Parameter Spaces for Complex Algorithms}, - author = {Michel Lang and Bernd Bischl and Jakob Richter and Xudong Sun and Martin Binder}, - year = {2024}, - note = {R package version 1.0.1}, - url = {https://CRAN.R-project.org/package=paradox}, +@Manual{paradox, + title = {paradox: Define and Work with Parameter Spaces for Complex Algorithms}, + author = {Michel Lang and Bernd Bischl and Jakob Richter and Xudong Sun and Martin Binder}, + year = {2024}, + note = {R package version 1.0.1}, + url = {https://CRAN.R-project.org/package=paradox} +} From a054f0ab7fb32a09f66c06836cd152fda7a65b60 Mon Sep 17 00:00:00 2001 From: john Date: Mon, 9 Dec 2024 11:39:08 +0100 Subject: [PATCH 06/46] refine paper --- paper/paper.bib | 26 ++++++++++++++++++++++++++ paper/paper.md | 28 +++++++++++++++------------- 2 files changed, 41 insertions(+), 13 deletions(-) diff --git a/paper/paper.bib b/paper/paper.bib index 5d7fe443e..26ef107db 100644 --- a/paper/paper.bib +++ b/paper/paper.bib @@ -7,6 +7,21 @@ @book{Bischl2024 publisher = {CRC Press} } +@article{Lang2019, + author = {Lang, Michel and Binder, Martin and Richter, Jakob and Schratz, Patrick and Pfisterer, Florian and Coors, 
Stefan and Au, Quay and Casalicchio, Giuseppe and Kotthoff, Lars and Bischl, Bernd}, + doi = {10.21105/JOSS.01903}, + issn = {2475-9066}, + journal = {Journal of Open Source Software}, + month = {dec}, + number = {44}, + pages = {1903}, + publisher = {The Open Journal}, + title = {{mlr3: A modern object-oriented machine learning framework in R}}, + url = {https://joss.theoj.org/papers/10.21105/joss.01903}, + volume = {4}, + year = {2019} +} + @Manual{R, title = {R: A Language and Environment for Statistical Computing}, author = {{R Core Team}}, @@ -110,3 +125,14 @@ @Manual{paradox note = {R package version 1.0.1}, url = {https://CRAN.R-project.org/package=paradox} } + +@article{mlr3pipelines2021, + author = {Binder, Martin and Pfisterer, Florian and Lang, Michel and Schneider, Lennart and Kotthoff, Lars and Bischl, Bernd}, + journal = {Journal of Machine Learning Research}, + number = {184}, + pages = {1--7}, + title = {{mlr3pipelines - Flexible Machine Learning Pipelines in R}}, + url = {http://jmlr.org/papers/v22/21-0281.html}, + volume = {22}, + year = {2021} +} diff --git a/paper/paper.md b/paper/paper.md index e8239864a..d1b5394c7 100644 --- a/paper/paper.md +++ b/paper/paper.md @@ -33,14 +33,16 @@ bibliography: paper.bib # Summary -The [`mlr3extralearners`](https://mlr3extralearners.mlr-org.com/) [`R`](https://www.r-project.org/) [@R] package is a community-driven package that integrates external machine learning algorithms into the [`mlr3`](https://mlr3.mlr-org.com/) [@mlr3] ecosystem. +The [`mlr3extralearners`](https://mlr3extralearners.mlr-org.com/) [`R`](https://www.r-project.org/) [@R] package is a community-driven package that integrates external machine learning algorithms into the [`mlr3`](https://mlr3.mlr-org.com/) [@Lang2019] ecosystem. The `mlr3` ecosystem is a versatile toolbox for machine learning in `R` and is targeted towards both practitioners and researchers [@Bischl2024]. 
-At its core, the package provides a standardized interface for machine learning and connects many R packages implementing machine learning algorithms into a unified framework. -The `mlr3extralearners` package currently wraps 85 different learning algorithms from many different R packages, making these methods immediately accessible to `mlr3` users. -An overview of all `mlr3` learners, including those from `mlr3extralearners`, is given in the [mlr3 website](https://mlr3learners.mlr-org.com/). -Furthermore, the package also allows `mlr3` users and package developers to easily add their own learners to the ecosystem. -In addition to making these learners available to `mlr3` users, integrating learners into `mlr3extralearners` also annotates them with extensive metadata about their parameter space, predict types and other capabilities. -Furthermore, `mlr3extralearners` verifies the correctness of learners by regularly running sanity checks on the learner, as well as verifying that the parameter space is up to date with the latest version of the package implementing the algorithm. +At its core, the `mlr3extralearners` package provides a standardized interface for machine learning and connects many R packages implementing machine learning algorithms into a unified framework. +The package currently wraps **85 different learning algorithms** from many different R packages, for tasks such as classification, regression, and survival analysis. +This enables `mlr3` users to seamlessly access and utilize these learners directly within their workflows. +An overview of all `mlr3` learners, including those from `mlr3extralearners`, is available on the [mlr3 website](https://mlr-org.com/learners.html). + +Beyond accessibility, `mlr3extralearners` also allows `mlr3` users and package developers to easily add their own learners to the ecosystem. +This enriches each learner with extensive metadata about its parameter space, prediction types, and other key attributes. 
+Furthermore, `mlr3extralearners` includes robust mechanisms for quality assurance, such as regular sanity checks and verification tests that ensure learner parameters are consistent and up-to-date with the latest versions of their underlying R packages. In order to allow the package to also include learners that are not available on `CRAN`, the package is hosted on the [`mlr` R-universe](https://mlr-org.r-universe.dev/). - Comparison with other packages: @@ -49,13 +51,13 @@ In order to allow the package to also include learners that are not available on # Statement of Need -In order to solve modeling problems using machine learning, one often has specific requirements for the learning algorithm such as performance, interpretability, or the ability to handle specific data types. +In order to solve modeling problems using machine learning, one often has specific requirements for the learning algorithm such as performance, interpretability, or the ability to handle specific data types and modeling tasks. For this reason, it is essential for the `mlr3` ecosystem to offer a wide variety of learners, such that users can choose the most appropriate learner for their specific problem. While connecting a new learner to `mlr3` is straightforward and can be done on a per-need basis, integrating learners into `mlr3extralearners` also makes this available to other users and avoids replication of effort. Furthermore, contributing to `mlr3extralearners` also has the added benefits that the learners are reviewed by the maintainers of the package, ensuring that they are correct and work as expected. Besides the advantage for users of machine learning methods, `mlr3extralearners` also offers benefits for package developers. 
-After developing a new R package that implements a machine learning algorithm, making it available in the `mlr3` ecosystem means that the learning algorithm is immediately integrated into the wider ecosystem and can therefore easily be tuned or combined with preprocessing steps. +After developing a new R package that implements a machine learning algorithm, making it available in the `mlr3` ecosystem means that the learning algorithm is immediately integrated into the wider ecosystem and can therefore easily be tuned or combined with preprocessing steps [@mlr3pipelines2021]. # Features @@ -67,14 +69,14 @@ However, the benefits of `mlr3extralearners` do not stop at mere integration. One core feature of the `mlr3` ecosystem is that it annotates learners with extensive metadata. For one, the parameter spaces of learners are defined as parameter sets as defined in the [`paradox` package](https://paradox.mlr-org.com/) [@paradox]. -Parameters are explicitly typed, their ranges or list of available values are annotated and this information is used to both check for valid configurations, but also allow for easier tuning of the hyperparameters. +Parameters are explicitly typed, their ranges or list of available values are annotated and this information is used to both check for valid configurations, but also allow for easier parameter tuning. Furthermore, learners are annotated with respect to their task type (such as classification, regression or survival analysis) and predict type (such as probabilities or class predictions), which feature types they can handle, and which capabilities they have. The latter are standardized via a set of standardized properties, which e.g. includes the ability to do feature selection, to assign importance scores to features, or to handle missing values. ## Functional Correctness -One problem that comes with wrapping learning algorithms from different R packages is that their API can change. 
-The most frequent case is that new parameters are added, which were not present in the version of the package when the learner was wrapped. +One problem that manifests when integrating learning algorithms from different R packages is that their API can change. +The most frequent case is that new parameters are added, which were not present in the version of the package when the learner was integrated. In `mlr3extralearners`, we regularly check whether the learner implements the expected interface of the upstream function and update the parameter set accordingly. In addition to this `mlr3`-specific check, `mlr3extralearners` also verifies the correctness of learners by regularly running automatic tests on the learners. @@ -84,7 +86,7 @@ These tests perform simple sanity checks and also verify that the learner's meta In order to make the integration of new learners into `mlr3extralearners` as easy as possible, we provide templates for generating code for both the learner itself, as well as associated test files. These templates can easily be created via an `R` function that takes in the metadata of the learner and generates files that fill out as much as possible and clearly indicate what is needed to be added by the user. -The package website contains an [extensive tutorioal](https://mlr3extralearners.mlr-org.com/articles/extending.html) on how to do this, as well as a list with [common mistakes](https://mlr3extralearners.mlr-org.com/articles/common_issues.html). +The package website contains an [extensive tutorioal](https://mlr3extralearners.mlr-org.com/articles/extending.html) on how to do this, as well as a list with [common mistakes](https://mlr3extralearners.mlr-org.com/articles/common_issues.html) encountered by several contributors. 
# Acknowledgements From 691f487b1862b54a1282c863e2b874c758401b8d Mon Sep 17 00:00:00 2001 From: john Date: Mon, 9 Dec 2024 15:20:19 +0100 Subject: [PATCH 07/46] add authors --- paper/paper.bib | 15 +++++++++++ paper/paper.md | 66 +++++++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 76 insertions(+), 5 deletions(-) diff --git a/paper/paper.bib b/paper/paper.bib index 26ef107db..5441ca7dd 100644 --- a/paper/paper.bib +++ b/paper/paper.bib @@ -136,3 +136,18 @@ @article{mlr3pipelines2021 volume = {22}, year = {2021} } + +@article{Sonabend2021, + author = {Sonabend, Raphael and Kir{\'{a}}ly, Franz J. and Bender, Andreas and Bischl, Bernd and Lang, Michel}, + doi = {10.1093/BIOINFORMATICS/BTAB039}, + issn = {1367-4803}, + journal = {Bioinformatics}, + month = {sep}, + number = {17}, + pages = {2789--2791}, + publisher = {Oxford Academic}, + title = {{mlr3proba: an R package for machine learning in survival analysis}}, + url = {https://academic.oup.com/bioinformatics/article/37/17/2789/6125361}, + volume = {37}, + year = {2021} +} diff --git a/paper/paper.md b/paper/paper.md index d1b5394c7..5df01f530 100644 --- a/paper/paper.md +++ b/paper/paper.md @@ -9,24 +9,79 @@ authors: - name: Sebastian Fischer orcid: 0000-0002-9609-3197 affiliation: "2, 3" + - name: John Zobolas + orcid: 0000-0002-3609-8674 + affiliation: 4 + - name: Raphael Sonabend + orcid: 0000-0001-9225-4654 + - name: Marc Becker + orcid: 0000-0002-8115-0400 + affiliation: 2 - name: Michel Lang orcid: 0000-0001-9754-0393 affiliation: "1, 2" - name: Martin Binder affiliation: 2 + - name: Lennart Schneider + orchid: 0000-0003-4152-5308 + affiliation: 2 + - name: Lukas Burk + orchid: 0000-0001-7528-3795 + affiliation: "2, 3" - name: Patrick Schratz orcid: 0000-0003-0748-6624 affiliation: 2 + - name: Byron C. Jaeger + orchid: 0000-0001-7399-2299 + affiliation: + - name: Stephen A. Lauer + orchid: + affiliation: 7 + - name: Lorenz A. 
Kapsner + orchid: + affiliation: 8 + - name: Maximilian Mücke + orchid: 0009-0000-9432-9795 + affiliation: 2 + - name: Zezhi Wang + orchid: + affiliation: 9 + - name: Keenan Ganz + orchid: 0000-0002-8486-3959 + affiliation: 10 + - name: Henri Funk + orchid: 0009-0007-0949-8385 + affiliation: + - name: Philipp Kopper + orchid: 0000-0002-5037-7135 + affiliation: 3 + - name: Andreas Bender + orchid: 0000-0001-5628-8611 + affiliation: "2, 3" - name: Bernd Bischl orcid: 0000-0001-6002-6980 - affiliation: "2, 3" + affiliation: "2, 3, 5, 6" affiliations: - - name: TU Dortmund University + - name: TU Dortmund University, Germany index: 1 - - name: LMU Munich + - name: Department of Statistics, LMU Munich, Germany index: 2 - - name: Munich Center for Machine Learning + - name: Munich Center for Machine Learning (MCML), Germany index: 3 + - name: Department of Cancer Genetics, Institute for Cancer Research, Oslo University Hospital, Norway + index: 4 + - name: Leibniz Institute for Prevention Research and Epidemiology (BIPS), Bremen, Germany + index: 5 + - name: Faculty of Mathematics and Computer Science, University of Bremen, Germany + index: 6 + - name: Certilytics, Inc., Louisville, Kentucky + index: 7 + - name: Friedrich-Alexander-Universität Erlangen-Nürnberg (FAU), Erlangen, Germany + index: 8 + - name: Department of Statistics and Finance/International Institute of Finance, School of Management, University of Science and Technology of China, Hefei, Anhui, China + index: 9 + - name: School of Environmental and Forest Sciences, University of Washington, Seattle + index: 10 date: XXX December 2024 bibliography: paper.bib --- @@ -70,7 +125,7 @@ However, the benefits of `mlr3extralearners` do not stop at mere integration. One core feature of the `mlr3` ecosystem is that it annotates learners with extensive metadata. For one, the parameter spaces of learners are defined as parameter sets as defined in the [`paradox` package](https://paradox.mlr-org.com/) [@paradox]. 
Parameters are explicitly typed, their ranges or list of available values are annotated and this information is used to both check for valid configurations, but also allow for easier parameter tuning. -Furthermore, learners are annotated with respect to their task type (such as classification, regression or survival analysis) and predict type (such as probabilities or class predictions), which feature types they can handle, and which capabilities they have. +Furthermore, learners are annotated with respect to their task type (such as classification, regression or survival analysis [@Sonabend2021]) and predict type (such as probabilities or class predictions), which feature types they can handle, and which capabilities they have. The latter are standardized via a set of standardized properties, which e.g. includes the ability to do feature selection, to assign importance scores to features, or to handle missing values. ## Functional Correctness @@ -92,5 +147,6 @@ The package website contains an [extensive tutorioal](https://mlr3extralearners. Sebastian Fischer is supported by the Deutsche Forschungsgemeinschaft (DFG, German Research Foundation) – 460135501 (NFDI project MaRDI). +John Zobolas received funding from the European Union's Horizon 2020 research and innovation programme under grant agreement No 101016851, project PANCAIM. 
# References From 21f797f3cf83b6a127901d7e1cd4c5f5777554b7 Mon Sep 17 00:00:00 2001 From: john Date: Mon, 9 Dec 2024 15:21:06 +0100 Subject: [PATCH 08/46] correct author affiliation --- paper/paper.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paper/paper.md b/paper/paper.md index 5df01f530..5263c62d9 100644 --- a/paper/paper.md +++ b/paper/paper.md @@ -24,7 +24,7 @@ authors: affiliation: 2 - name: Lennart Schneider orchid: 0000-0003-4152-5308 - affiliation: 2 + affiliation: "2, 3" - name: Lukas Burk orchid: 0000-0001-7528-3795 affiliation: "2, 3" From cf2ae97f77169c08b10be241b37b7b87a102cff4 Mon Sep 17 00:00:00 2001 From: john Date: Mon, 9 Dec 2024 15:46:55 +0100 Subject: [PATCH 09/46] refine paper --- paper/paper.bib | 9 +++++++++ paper/paper.md | 6 +++--- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/paper/paper.bib b/paper/paper.bib index 5441ca7dd..9c058264a 100644 --- a/paper/paper.bib +++ b/paper/paper.bib @@ -151,3 +151,12 @@ @article{Sonabend2021 volume = {37}, year = {2021} } + +@incollection{benchlargescale, + author = "Sebastian Fischer and Michel Lang and Marc Becker", + title = "Large-Scale Benchmarking", + booktitle = "Applied Machine Learning Using {m}lr3 in {R}", + publisher = "CRC Press", year = "2024", + editor = "Bernd Bischl and Raphael Sonabend and Lars Kotthoff and Michel Lang", + url = "https://mlr3book.mlr-org.com/large-scale_benchmarking.html" +} diff --git a/paper/paper.md b/paper/paper.md index 5263c62d9..6bac58871 100644 --- a/paper/paper.md +++ b/paper/paper.md @@ -92,12 +92,12 @@ The [`mlr3extralearners`](https://mlr3extralearners.mlr-org.com/) [`R`](https:// The `mlr3` ecosystem is a versatile toolbox for machine learning in `R` and is targeted towards both practitioners and researchers [@Bischl2024]. At its core, the `mlr3extralearners` package provides a standardized interface for machine learning and connects many R packages implementing machine learning algorithms into a unified framework. 
The package currently wraps **85 different learning algorithms** from many different R packages, for tasks such as classification, regression, and survival analysis. -This enables `mlr3` users to seamlessly access and utilize these learners directly within their workflows. +This enables `mlr3` users to seamlessly access and utilize these learners directly within their workflows as well as execute large-scale empirical benchmark experiments [@benchlargescale]. An overview of all `mlr3` learners, including those from `mlr3extralearners`, is available on the [mlr3 website](https://mlr-org.com/learners.html). Beyond accessibility, `mlr3extralearners` also allows `mlr3` users and package developers to easily add their own learners to the ecosystem. -This enriches each learner with extensive metadata about its parameter space, prediction types, and other key attributes. -Furthermore, `mlr3extralearners` includes robust mechanisms for quality assurance, such as regular sanity checks and verification tests that ensure learner parameters are consistent and up-to-date with the latest versions of their underlying R packages. +This **enriches each learner with extensive metadata** about its parameter space, prediction types, and other key attributes. +Furthermore, `mlr3extralearners` includes robust mechanisms for **quality assurance**, such as regular sanity checks and verification tests that ensure learner parameters are consistent and up-to-date with the latest versions of their underlying R packages. In order to allow the package to also include learners that are not available on `CRAN`, the package is hosted on the [`mlr` R-universe](https://mlr-org.r-universe.dev/). 
- Comparison with other packages: From 940e7ba91bf0551fffebd133ccc4396e0147167f Mon Sep 17 00:00:00 2001 From: john Date: Mon, 9 Dec 2024 16:02:53 +0100 Subject: [PATCH 10/46] add author info --- paper/paper.bib | 7 +++---- paper/paper.md | 6 +++++- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/paper/paper.bib b/paper/paper.bib index 9c058264a..7a2994ab3 100644 --- a/paper/paper.bib +++ b/paper/paper.bib @@ -89,11 +89,10 @@ @article{weka } @Manual{tidymodels, - title = {tidymodels: Easily Install and Load the 'Tidymodels' Packages}, + title = {Tidymodels: a collection of packages for modeling and machine learning using tidyverse principles.}, author = {Max Kuhn and Hadley Wickham}, - year = {2019}, - note = {R package version 0.0.3}, - url = {https://CRAN.R-project.org/package=tidymodels}, + url = {https://www.tidymodels.org}, + year = {2020}, } @software{mlj, diff --git a/paper/paper.md b/paper/paper.md index 6bac58871..f169221d1 100644 --- a/paper/paper.md +++ b/paper/paper.md @@ -51,7 +51,7 @@ authors: affiliation: 10 - name: Henri Funk orchid: 0009-0007-0949-8385 - affiliation: + affiliation: "3, 11, 12" - name: Philipp Kopper orchid: 0000-0002-5037-7135 affiliation: 3 @@ -82,6 +82,10 @@ affiliations: index: 9 - name: School of Environmental and Forest Sciences, University of Washington, Seattle index: 10 + - name: Department of Geography, LMU Munich, Germany + index: 11 + - name: Statistical Consulting Unit StaBLab, LMU Munich, Germany + index: 12 date: XXX December 2024 bibliography: paper.bib --- From 8972c9c62ae6f0bd538beda443061929f61ded59 Mon Sep 17 00:00:00 2001 From: john Date: Mon, 9 Dec 2024 16:21:02 +0100 Subject: [PATCH 11/46] add Byron's affiliation and orchid --- paper/paper.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/paper/paper.md b/paper/paper.md index f169221d1..832051c1a 100644 --- a/paper/paper.md +++ b/paper/paper.md @@ -33,7 +33,7 @@ authors: affiliation: 2 - name: Byron C. 
Jaeger orchid: 0000-0001-7399-2299 - affiliation: + affiliation: 13 - name: Stephen A. Lauer orchid: affiliation: 7 @@ -86,6 +86,8 @@ affiliations: index: 11 - name: Statistical Consulting Unit StaBLab, LMU Munich, Germany index: 12 + - name: Wake Forest University School of Medicine, Department of Biostatistics and Data Science, Division of Public Health SciencesWinston-Salem, North Carolina + index: 13 date: XXX December 2024 bibliography: paper.bib --- From 2629ec1a447897caac87fab072caaaae9af98993 Mon Sep 17 00:00:00 2001 From: john Date: Mon, 9 Dec 2024 16:38:18 +0100 Subject: [PATCH 12/46] refine text --- paper/paper.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/paper/paper.md b/paper/paper.md index 832051c1a..348aeb3b8 100644 --- a/paper/paper.md +++ b/paper/paper.md @@ -98,13 +98,13 @@ The [`mlr3extralearners`](https://mlr3extralearners.mlr-org.com/) [`R`](https:// The `mlr3` ecosystem is a versatile toolbox for machine learning in `R` and is targeted towards both practitioners and researchers [@Bischl2024]. At its core, the `mlr3extralearners` package provides a standardized interface for machine learning and connects many R packages implementing machine learning algorithms into a unified framework. The package currently wraps **85 different learning algorithms** from many different R packages, for tasks such as classification, regression, and survival analysis. -This enables `mlr3` users to seamlessly access and utilize these learners directly within their workflows as well as execute large-scale empirical benchmark experiments [@benchlargescale]. +This enables users to seamlessly access and utilize these learners directly within their workflows as well as execute large-scale empirical benchmark experiments [@benchlargescale], leveraging the `mlr3` framework's parallelization and optimization capabilities. 
An overview of all `mlr3` learners, including those from `mlr3extralearners`, is available on the [mlr3 website](https://mlr-org.com/learners.html). Beyond accessibility, `mlr3extralearners` also allows `mlr3` users and package developers to easily add their own learners to the ecosystem. This **enriches each learner with extensive metadata** about its parameter space, prediction types, and other key attributes. Furthermore, `mlr3extralearners` includes robust mechanisms for **quality assurance**, such as regular sanity checks and verification tests that ensure learner parameters are consistent and up-to-date with the latest versions of their underlying R packages. -In order to allow the package to also include learners that are not available on `CRAN`, the package is hosted on the [`mlr` R-universe](https://mlr-org.r-universe.dev/). +In order to allow the integration of learners that are not available on `CRAN`, the package is hosted on the [`mlr` R-universe](https://mlr-org.r-universe.dev/). - Comparison with other packages: - parsnip tidymodels @@ -145,7 +145,7 @@ These tests perform simple sanity checks and also verify that the learner's meta ## Templates for new Learners -In order to make the integration of new learners into `mlr3extralearners` as easy as possible, we provide templates for generating code for both the learner itself, as well as associated test files. +In order to make the integration of new learners into `mlr3extralearners` as easy as possible, we provide templates for generating code for both the new learner itself, as well as associated test files. These templates can easily be created via an `R` function that takes in the metadata of the learner and generates files that fill out as much as possible and clearly indicate what is needed to be added by the user. 
The package website contains an [extensive tutorioal](https://mlr3extralearners.mlr-org.com/articles/extending.html) on how to do this, as well as a list with [common mistakes](https://mlr3extralearners.mlr-org.com/articles/common_issues.html) encountered by several contributors. From b348075a5d71e85e360ab6aba1af91041ad93670 Mon Sep 17 00:00:00 2001 From: john Date: Mon, 9 Dec 2024 17:15:21 +0100 Subject: [PATCH 13/46] refine paper --- paper/paper.bib | 27 ++++++++++++++++++++++++++- paper/paper.md | 35 ++++++++++++++++++++--------------- 2 files changed, 46 insertions(+), 16 deletions(-) diff --git a/paper/paper.bib b/paper/paper.bib index 7a2994ab3..262ba84ad 100644 --- a/paper/paper.bib +++ b/paper/paper.bib @@ -92,7 +92,32 @@ @Manual{tidymodels title = {Tidymodels: a collection of packages for modeling and machine learning using tidyverse principles.}, author = {Max Kuhn and Hadley Wickham}, url = {https://www.tidymodels.org}, - year = {2020}, + year = {2020} +} + +@Manual{parsnip, + title = {parsnip: A Common API to Modeling and Analysis Functions}, + author = {Max Kuhn and Davis Vaughan}, + year = {2024}, + note = {R package version 1.2.1, https://parsnip.tidymodels.org/}, + url = {https://github.com/tidymodels/parsnip} +} + +@Manual{mlr3tuning, + title = {mlr3tuning: Hyperparameter Optimization for 'mlr3'} + author = {Becker, Marc and Lang, Michel and Richter, Jakob and Bischl, Bernd and Schalk, Daniel}, + year = {2024}, + note = {R package version 1.2.1, https://mlr3tuning.mlr-org.com/}, + url = {https://github.com/mlr-org/mlr3tuning} +} + +@article{caret, + title={Building predictive models in R using the caret package}, + author={Kuhn, Max}, + journal={Journal of statistical software}, + volume={28}, + pages={1--26}, + year={2008} } @software{mlj, diff --git a/paper/paper.md b/paper/paper.md index 348aeb3b8..d56d86d9f 100644 --- a/paper/paper.md +++ b/paper/paper.md @@ -35,7 +35,7 @@ authors: orchid: 0000-0001-7399-2299 affiliation: 13 - name: Stephen A. 
Lauer - orchid: + orchid: 0000-0003-2948-630X affiliation: 7 - name: Lorenz A. Kapsner orchid: @@ -74,7 +74,7 @@ affiliations: index: 5 - name: Faculty of Mathematics and Computer Science, University of Bremen, Germany index: 6 - - name: Certilytics, Inc., Louisville, Kentucky + - name: Certilytics, Inc., 9200 Shelbyville Rd, Louisville, KY, 40222, USA index: 7 - name: Friedrich-Alexander-Universität Erlangen-Nürnberg (FAU), Erlangen, Germany index: 8 @@ -98,27 +98,32 @@ The [`mlr3extralearners`](https://mlr3extralearners.mlr-org.com/) [`R`](https:// The `mlr3` ecosystem is a versatile toolbox for machine learning in `R` and is targeted towards both practitioners and researchers [@Bischl2024]. At its core, the `mlr3extralearners` package provides a standardized interface for machine learning and connects many R packages implementing machine learning algorithms into a unified framework. The package currently wraps **85 different learning algorithms** from many different R packages, for tasks such as classification, regression, and survival analysis. -This enables users to seamlessly access and utilize these learners directly within their workflows as well as execute large-scale empirical benchmark experiments [@benchlargescale], leveraging the `mlr3` framework's parallelization and optimization capabilities. -An overview of all `mlr3` learners, including those from `mlr3extralearners`, is available on the [mlr3 website](https://mlr-org.com/learners.html). +This enables users to seamlessly access and utilize these learners directly within their workflows. +It also facilitates large-scale empirical benchmark experiments, leveraging the `mlr3` framework's parallelization and optimization capabilities [@benchlargescale]. +An overview of all `mlr3` learners, including those introduced through `mlr3extralearners`, is available on the [mlr3 website](https://mlr-org.com/learners.html). 
Beyond accessibility, `mlr3extralearners` also allows `mlr3` users and package developers to easily add their own learners to the ecosystem. This **enriches each learner with extensive metadata** about its parameter space, prediction types, and other key attributes. Furthermore, `mlr3extralearners` includes robust mechanisms for **quality assurance**, such as regular sanity checks and verification tests that ensure learner parameters are consistent and up-to-date with the latest versions of their underlying R packages. In order to allow the integration of learners that are not available on `CRAN`, the package is hosted on the [`mlr` R-universe](https://mlr-org.r-universe.dev/). -- Comparison with other packages: - - parsnip tidymodels - - ??? - # Statement of Need -In order to solve modeling problems using machine learning, one often has specific requirements for the learning algorithm such as performance, interpretability, or the ability to handle specific data types and modeling tasks. -For this reason, it is essential for the `mlr3` ecosystem to offer a wide variety of learners, such that users can choose the most appropriate learner for their specific problem. -While connecting a new learner to `mlr3` is straightforward and can be done on a per-need basis, integrating learners into `mlr3extralearners` also makes this available to other users and avoids replication of effort. -Furthermore, contributing to `mlr3extralearners` also has the added benefits that the learners are reviewed by the maintainers of the package, ensuring that they are correct and work as expected. - -Besides the advantage for users of machine learning methods, `mlr3extralearners` also offers benefits for package developers. 
-After developing a new R package that implements a machine learning algorithm, making it available in the `mlr3` ecosystem means that the learning algorithm is immediately integrated into the wider ecosystem and can therefore easily be tuned or combined with preprocessing steps [@mlr3pipelines2021]. +Machine learning often requires practitioners to navigate a diverse array of modeling problems, each with unique demands such as performance, interpretability, or compatibility with specific data types and tasks. +To address this challenge, packages like `caret` [@caret] and `parsnip` [@parsnip] from the `tidymodels` ecosystem have historically provided unified interfaces for simplifying model experimentation [@tidymodels]. +For instance, `parsnip` provides a clean and consistent way to define models, enabling users to experiment with different algorithms without dealing with the nuances of underlying package syntax. +Similarly, the `mlr3` ecosystem aims to streamline model selection and experimentation, making it a versatile toolbox for machine learning in R. + +Within this ecosystem, `mlr3extralearners` plays a crucial role by providing a comprehensive collection of external machine learning algorithms integrated into the `mlr3` framework. +This ensures that users can access a wide variety of learners to meet their specific needs, +and choose the most appropriate learner for their specific problem. +While connecting new learners to `mlr3` is straightforward and can be done on a per-need basis, integrating them into `mlr3extralearners` benefits the broader community by avoiding redundant effort and ensuring accessibility for all users. +Additionally, contributions to `mlr3extralearners` are reviewed by the package maintainers, providing a layer of quality assurance. +This review process ensures that integrated learners function as expected and adhere to the high standards of the `mlr3` ecosystem. 
+ +Beyond its utility for users, `mlr3extralearners` also offers significant advantages for developers of machine learning packages. +By integrating a new algorithm into the `mlr3` ecosystem, developers can immediately make their methods accessible to a wider audience. +This integration facilitates seamless tuning [@mlr3tuning] and preprocessing [@mlr3pipelines2021] through the broader `mlr3` framework, enhancing the usability and impact of their work. # Features From ed967e5d571ba988700e723c62aa76b56fb45fff Mon Sep 17 00:00:00 2001 From: john Date: Mon, 9 Dec 2024 17:23:47 +0100 Subject: [PATCH 14/46] fix bib file --- paper/paper.bib | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paper/paper.bib b/paper/paper.bib index 262ba84ad..a31678a73 100644 --- a/paper/paper.bib +++ b/paper/paper.bib @@ -104,7 +104,7 @@ @Manual{parsnip } @Manual{mlr3tuning, - title = {mlr3tuning: Hyperparameter Optimization for 'mlr3'} + title = {mlr3tuning: Hyperparameter Optimization for 'mlr3'}, author = {Becker, Marc and Lang, Michel and Richter, Jakob and Bischl, Bernd and Schalk, Daniel}, year = {2024}, note = {R package version 1.2.1, https://mlr3tuning.mlr-org.com/}, From e3225a85afa36b9c6cdb86d4d788cac189de475e Mon Sep 17 00:00:00 2001 From: john Date: Mon, 9 Dec 2024 22:58:29 +0100 Subject: [PATCH 15/46] refine text in rest of the paper sections --- paper/paper.md | 41 +++++++++++++++++++++++------------------ 1 file changed, 23 insertions(+), 18 deletions(-) diff --git a/paper/paper.md b/paper/paper.md index d56d86d9f..1515d022b 100644 --- a/paper/paper.md +++ b/paper/paper.md @@ -38,7 +38,7 @@ authors: orchid: 0000-0003-2948-630X affiliation: 7 - name: Lorenz A. Kapsner - orchid: + orchid: 0000-0003-1866-860X affiliation: 8 - name: Maximilian Mücke orchid: 0009-0000-9432-9795 @@ -119,7 +119,7 @@ This ensures that users can access a wide variety of learners to meet their spec and choose the most appropriate learner for their specific problem. 
While connecting new learners to `mlr3` is straightforward and can be done on a per-need basis, integrating them into `mlr3extralearners` benefits the broader community by avoiding redundant effort and ensuring accessibility for all users. Additionally, contributions to `mlr3extralearners` are reviewed by the package maintainers, providing a layer of quality assurance. -This review process ensures that integrated learners function as expected and adhere to the high standards of the `mlr3` ecosystem. +This review process ensures that integrated learners work as expected and adhere to the high standards of the `mlr3` ecosystem. Beyond its utility for users, `mlr3extralearners` also offers significant advantages for developers of machine learning packages. By integrating a new algorithm into the `mlr3` ecosystem, developers can immediately make their methods accessible to a wider audience. @@ -127,32 +127,37 @@ This integration facilitates seamless tuning [@mlr3tuning] and preprocessing [@m # Features -The core functionality of `mlr3extralearners` is to integrate new learners into the `mlr3` ecosystem. -By doing so, many different learning algorithms can be used with the same syntax and standardized interface. -However, the benefits of `mlr3extralearners` do not stop at mere integration. +The core functionality of `mlr3extralearners` is to integrate new learners into the `mlr3` ecosystem, allowing users to access a wide array of learning algorithms through a unified syntax and standardized interface. +However, the advantages of `mlr3extralearners` go well beyond simple integration. ## Metadata One core feature of the `mlr3` ecosystem is that it annotates learners with extensive metadata. -For one, the parameter spaces of learners are defined as parameter sets as defined in the [`paradox` package](https://paradox.mlr-org.com/) [@paradox]. 
-Parameters are explicitly typed, their ranges or list of available values are annotated and this information is used to both check for valid configurations, but also allow for easier parameter tuning. -Furthermore, learners are annotated with respect to their task type (such as classification, regression or survival analysis [@Sonabend2021]) and predict type (such as probabilities or class predictions), which feature types they can handle, and which capabilities they have. -The latter are standardized via a set of standardized properties, which e.g. includes the ability to do feature selection, to assign importance scores to features, or to handle missing values. + +- **Parameter Management**: The parameter spaces of learners are defined using parameter sets from the [`paradox` package](https://paradox.mlr-org.com/) [@paradox]. +Each parameter is explicitly typed, with annotations for valid ranges and allowable values. +This ensures valid configurations and simplifies tasks like parameter tuning. +- **Task and Prediction Types**: Learners are categorized with respect to their task type (e.g. as classification, regression or survival analysis [@Sonabend2021]) and prediction types (e.g. probabilities or class predictions). +This allows users to easily identify suitable learners for their specific modeling tasks. +- **Standardized Properties**: Learners are also annotated with properties such as the feature types they can process, and whether they support functionalities such as feature selection, importance scoring, handling missing values or whether they can track performance during training (validation). +This allows users to have a clear understanding of a learner's capabilities and limitations and assess if it aligns with the specific requirements of their workflows, reducing trial-and-error and streamlining the modeling process. 
## Functional Correctness -One problem that manifests when integrating learning algorithms from different R packages is that their API can change. -The most frequent case is that new parameters are added, which were not present in the version of the package when the learner was integrated. -In `mlr3extralearners`, we regularly check whether the learner implements the expected interface of the upstream function and update the parameter set accordingly. +Integrating learners from diverse R packages poses challenges, particularly due to changes in upstream APIs. `mlr3extralearners` addresses these issues through rigorous checks: + +- **Interface Consistency**: The package regularly verifies that each learner adheres to the expected interface of its upstream function. +When new parameters are introduced or existing ones change, the tests fail until the parameter sets are updated accordingly. +- **Automated Testing**: To ensure correctness, `mlr3extralearners` performs regular automated tests on all learners. +These tests include sanity checks and validate metadata annotations, such as verifying that a learner claiming to handle missing values works as expected. -In addition to this `mlr3`-specific check, `mlr3extralearners` also verifies the correctness of learners by regularly running automatic tests on the learners. -These tests perform simple sanity checks and also verify that the learner's metadata is correctly annotated, e.g. that a learner that claims to be able to handle missing values actually does so. +## Simplified Integration of New Learners -## Templates for new Learners +To streamline the addition of new learners, `mlr3extralearners` provides robust support tools: -In order to make the integration of new learners into `mlr3extralearners` as easy as possible, we provide templates for generating code for both the new learner itself, as well as associated test files. 
-These templates can easily be created via an `R` function that takes in the metadata of the learner and generates files that fill out as much as possible and clearly indicate what is needed to be added by the user. -The package website contains an [extensive tutorioal](https://mlr3extralearners.mlr-org.com/articles/extending.html) on how to do this, as well as a list with [common mistakes](https://mlr3extralearners.mlr-org.com/articles/common_issues.html) encountered by several contributors. +- **Code Templates**: Predefined templates are available for generating the necessary code for both the learner implementation and associated test files. +These templates are generated through an `R` function that uses learner metadata to prefill as much information as possible, leaving only minimal input required from the user. +- **Guides and Resources**: The package website contains an contains an [extensive tutorial](https://mlr3extralearners.mlr-org.com/articles/extending.html), as well as a curated list of [common issues](https://mlr3extralearners.mlr-org.com/articles/common_issues.html) encountered during learner integration, making the process accessible for contributors of all experience levels. # Acknowledgements From cb8ed9f711a56651da8c0650b01c0c46de821308 Mon Sep 17 00:00:00 2001 From: john Date: Mon, 9 Dec 2024 23:08:12 +0100 Subject: [PATCH 16/46] add more author info --- paper/paper.md | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/paper/paper.md b/paper/paper.md index 1515d022b..89dd74167 100644 --- a/paper/paper.md +++ b/paper/paper.md @@ -34,7 +34,7 @@ authors: - name: Byron C. Jaeger orchid: 0000-0001-7399-2299 affiliation: 13 - - name: Stephen A. Lauer + - name: Stephen A Lauer orchid: 0000-0003-2948-630X affiliation: 7 - name: Lorenz A. 
Kapsner @@ -44,8 +44,11 @@ authors: orchid: 0009-0000-9432-9795 affiliation: 2 - name: Zezhi Wang - orchid: + orchid: 0000-0001-6988-5853 affiliation: 9 + - name: Damir Pulatov + orchid: 0000-0003-4901-7201 + affiliation: 14 - name: Keenan Ganz orchid: 0000-0002-8486-3959 affiliation: 10 @@ -88,6 +91,8 @@ affiliations: index: 12 - name: Wake Forest University School of Medicine, Department of Biostatistics and Data Science, Division of Public Health SciencesWinston-Salem, North Carolina index: 13 + - name: University of North Carolina Wilmington + index: 14 date: XXX December 2024 bibliography: paper.bib --- From 72f4975804da3c0ced5fcd4254ceec7358055717 Mon Sep 17 00:00:00 2001 From: john Date: Mon, 9 Dec 2024 23:09:17 +0100 Subject: [PATCH 17/46] change to author github name for abess learners --- R/learner_abess_classif_abess.R | 2 +- R/learner_abess_regr_abess.R | 2 +- man/mlr_learners_classif.abess.Rd | 2 +- man/mlr_learners_regr.abess.Rd | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/R/learner_abess_classif_abess.R b/R/learner_abess_classif_abess.R index a1cdd5195..e5cb7752d 100644 --- a/R/learner_abess_classif_abess.R +++ b/R/learner_abess_classif_abess.R @@ -1,5 +1,5 @@ #' @title Classification Abess Learner -#' @author abess-team +#' @author bbayukari #' @name mlr_learners_classif.abess #' #' @description diff --git a/R/learner_abess_regr_abess.R b/R/learner_abess_regr_abess.R index 7a5801db4..eeaccc876 100644 --- a/R/learner_abess_regr_abess.R +++ b/R/learner_abess_regr_abess.R @@ -1,5 +1,5 @@ #' @title Regression Abess Learner -#' @author abess-team +#' @author bbayukari #' @name mlr_learners_regr.abess #' #' @description diff --git a/man/mlr_learners_classif.abess.Rd b/man/mlr_learners_classif.abess.Rd index 722cbc3d8..5074cc029 100644 --- a/man/mlr_learners_classif.abess.Rd +++ b/man/mlr_learners_classif.abess.Rd @@ -104,7 +104,7 @@ predictions$score() } } \author{ -abess-team +bbayukari } \section{Super classes}{ 
\code{\link[mlr3:Learner]{mlr3::Learner}} -> \code{\link[mlr3:LearnerClassif]{mlr3::LearnerClassif}} -> \code{LearnerClassifAbess} diff --git a/man/mlr_learners_regr.abess.Rd b/man/mlr_learners_regr.abess.Rd index 09e27b809..33f3e3bbc 100644 --- a/man/mlr_learners_regr.abess.Rd +++ b/man/mlr_learners_regr.abess.Rd @@ -104,7 +104,7 @@ predictions$score() } } \author{ -abess-team +bbayukari } \section{Super classes}{ \code{\link[mlr3:Learner]{mlr3::Learner}} -> \code{\link[mlr3:LearnerRegr]{mlr3::LearnerRegr}} -> \code{LearnerRegrAbess} From ed37bb467ff2e01851291a6283c0a6015824b4ce Mon Sep 17 00:00:00 2001 From: john Date: Mon, 9 Dec 2024 23:16:46 +0100 Subject: [PATCH 18/46] refine text --- paper/paper.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paper/paper.md b/paper/paper.md index 89dd74167..7a609c9fd 100644 --- a/paper/paper.md +++ b/paper/paper.md @@ -144,7 +144,7 @@ Each parameter is explicitly typed, with annotations for valid ranges and allowa This ensures valid configurations and simplifies tasks like parameter tuning. - **Task and Prediction Types**: Learners are categorized with respect to their task type (e.g. as classification, regression or survival analysis [@Sonabend2021]) and prediction types (e.g. probabilities or class predictions). This allows users to easily identify suitable learners for their specific modeling tasks. -- **Standardized Properties**: Learners are also annotated with properties such as the feature types they can process, and whether they support functionalities such as feature selection, importance scoring, handling missing values or whether they can track performance during training (validation). 
+- **Standardized Properties**: Learners are annotated with detailed attributes, including the types of features they can process and their support for functionalities such as feature selection, importance scoring, handling missing values, or monitoring performance on a separate validation set during training, among others. This allows users to have a clear understanding of a learner's capabilities and limitations and assess if it aligns with the specific requirements of their workflows, reducing trial-and-error and streamlining the modeling process. ## Functional Correctness From 458444f4edf7522bf1fc1b203b86bb642ab5ca3e Mon Sep 17 00:00:00 2001 From: john Date: Mon, 9 Dec 2024 23:20:54 +0100 Subject: [PATCH 19/46] mention examples --- paper/paper.md | 1 + 1 file changed, 1 insertion(+) diff --git a/paper/paper.md b/paper/paper.md index 7a609c9fd..c6666e51c 100644 --- a/paper/paper.md +++ b/paper/paper.md @@ -163,6 +163,7 @@ To streamline the addition of new learners, `mlr3extralearners` provides robust - **Code Templates**: Predefined templates are available for generating the necessary code for both the learner implementation and associated test files. These templates are generated through an `R` function that uses learner metadata to prefill as much information as possible, leaving only minimal input required from the user. - **Guides and Resources**: The package website contains an contains an [extensive tutorial](https://mlr3extralearners.mlr-org.com/articles/extending.html), as well as a curated list of [common issues](https://mlr3extralearners.mlr-org.com/articles/common_issues.html) encountered during learner integration, making the process accessible for contributors of all experience levels. +Additionally, every integrated learner includes a simple example of usage in the documentation, ensuring that users can quickly understand how to utilize the learner effectively within the `mlr3` ecosystem. 
# Acknowledgements From 35bf2cbeae8b979697b826af9935d5cdded6ded1 Mon Sep 17 00:00:00 2001 From: john Date: Tue, 10 Dec 2024 11:06:23 +0100 Subject: [PATCH 20/46] correct some author info --- paper/paper.md | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/paper/paper.md b/paper/paper.md index c6666e51c..a2ff92ddc 100644 --- a/paper/paper.md +++ b/paper/paper.md @@ -14,9 +14,10 @@ authors: affiliation: 4 - name: Raphael Sonabend orcid: 0000-0001-9225-4654 + affiliation: "15, 16" - name: Marc Becker orcid: 0000-0002-8115-0400 - affiliation: 2 + affiliation: "2, 3"" - name: Michel Lang orcid: 0000-0001-9754-0393 affiliation: "1, 2" @@ -27,7 +28,7 @@ authors: affiliation: "2, 3" - name: Lukas Burk orchid: 0000-0001-7528-3795 - affiliation: "2, 3" + affiliation: "2, 3, 4, 5" - name: Patrick Schratz orcid: 0000-0003-0748-6624 affiliation: 2 @@ -63,7 +64,7 @@ authors: affiliation: "2, 3" - name: Bernd Bischl orcid: 0000-0001-6002-6980 - affiliation: "2, 3, 5, 6" + affiliation: "2, 3" affiliations: - name: TU Dortmund University, Germany index: 1 @@ -93,6 +94,10 @@ affiliations: index: 13 - name: University of North Carolina Wilmington index: 14 + - name: OSPO Now + index: 15 + - name: Imperial College London + index: 16 date: XXX December 2024 bibliography: paper.bib --- From a8bb61e31448c15672c5d8c4c95c0800d2505f6f Mon Sep 17 00:00:00 2001 From: john Date: Tue, 10 Dec 2024 11:12:42 +0100 Subject: [PATCH 21/46] correct authors + add last two --- paper/paper.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/paper/paper.md b/paper/paper.md index a2ff92ddc..639d86864 100644 --- a/paper/paper.md +++ b/paper/paper.md @@ -17,7 +17,7 @@ authors: affiliation: "15, 16" - name: Marc Becker orcid: 0000-0002-8115-0400 - affiliation: "2, 3"" + affiliation: "2, 3" - name: Michel Lang orcid: 0000-0001-9754-0393 affiliation: "1, 2" @@ -56,6 +56,8 @@ authors: - name: Henri Funk orchid: 0009-0007-0949-8385 affiliation: "3, 11, 12" + - name: 
Liana Harutyunyan + - name: Pierre Camilleri - name: Philipp Kopper orchid: 0000-0002-5037-7135 affiliation: 3 From 0c3b4848261645a9254ff1c6cd2d03d1d3c09d92 Mon Sep 17 00:00:00 2001 From: Sebastian Fischer Date: Wed, 11 Dec 2024 14:27:53 +0000 Subject: [PATCH 22/46] address some feedback --- paper/paper.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/paper/paper.md b/paper/paper.md index 639d86864..5a71c1f64 100644 --- a/paper/paper.md +++ b/paper/paper.md @@ -1,10 +1,11 @@ --- -title: 'mlr3extralearners: A community-driven package for integrating learners into mlr3' +title: 'mlr3extralearners: Expanding the mlr3 Ecosystem with Community-Driven Learner Integration' tags: - R - machine learning - community - FAIR + - benchmarking authors: - name: Sebastian Fischer orcid: 0000-0002-9609-3197 From 85917f0c17f88ac3a5634b1e27ebc53eb38d5036 Mon Sep 17 00:00:00 2001 From: Sebastian Fischer Date: Wed, 11 Dec 2024 16:16:04 +0000 Subject: [PATCH 23/46] ... --- paper/paper.bib | 12 ++++++++++++ paper/paper.md | 4 +++- 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/paper/paper.bib b/paper/paper.bib index a31678a73..c9228bf3c 100644 --- a/paper/paper.bib +++ b/paper/paper.bib @@ -184,3 +184,15 @@ @incollection{benchlargescale editor = "Bernd Bischl and Raphael Sonabend and Lars Kotthoff and Michel Lang", url = "https://mlr3book.mlr-org.com/large-scale_benchmarking.html" } + +@article{wilkinson2016fair, + title={The {FAIR} Guiding Principles for scientific data management and stewardship}, + author={Wilkinson, Mark D and Dumontier, Michel and Aalbersberg, IJsbrand Jan and Appleton, Gabrielle and Axton, Myles and Baak, Arie and Blomberg, Niklas and Boiten, Jan-Willem and da Silva Santos, Luiz Bonino and Bourne, Philip E and others}, + journal={Scientific Data}, + volume={3}, + number={1}, + pages={1--9}, + year={2016}, + publisher={Nature Publishing Group} +} + diff --git a/paper/paper.md b/paper/paper.md index 5a71c1f64..cfe02b5a8 100644 --- 
a/paper/paper.md +++ b/paper/paper.md @@ -119,6 +119,7 @@ Beyond accessibility, `mlr3extralearners` also allows `mlr3` users and package d This **enriches each learner with extensive metadata** about its parameter space, prediction types, and other key attributes. Furthermore, `mlr3extralearners` includes robust mechanisms for **quality assurance**, such as regular sanity checks and verification tests that ensure learner parameters are consistent and up-to-date with the latest versions of their underlying R packages. In order to allow the integration of learners that are not available on `CRAN`, the package is hosted on the [`mlr` R-universe](https://mlr-org.r-universe.dev/). +By providing a standardized interface and comprehensive metadata for each learner, mlr3extralearners enhances the FAIRness (findability, accessibility, interoperability, and reusability) of machine learning algorithms within the R ecosystem [#wilkinson2016fair]). # Statement of Need @@ -162,7 +163,8 @@ Integrating learners from diverse R packages poses challenges, particularly due - **Interface Consistency**: The package regularly verifies that each learner adheres to the expected interface of its upstream function. When new parameters are introduced or existing ones change, the tests fail until the parameter sets are updated accordingly. - **Automated Testing**: To ensure correctness, `mlr3extralearners` performs regular automated tests on all learners. -These tests include sanity checks and validate metadata annotations, such as verifying that a learner claiming to handle missing values works as expected. +These tests include sanity checks that, e.g., verify that the learners produce sensible predictions for simple tasks. +Furthermore, the tests also validate the learners' metadata annotations, such as whether a learner can actually handle missing values or is able to produce importance scores. 
## Simplified Integration of New Learners From 68b6973cc2544f40cd598aa7686567b05c82ed87 Mon Sep 17 00:00:00 2001 From: john Date: Thu, 12 Dec 2024 12:15:41 +0100 Subject: [PATCH 24/46] add community impact + future directions section --- paper/paper.md | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/paper/paper.md b/paper/paper.md index cfe02b5a8..11288ffea 100644 --- a/paper/paper.md +++ b/paper/paper.md @@ -119,7 +119,7 @@ Beyond accessibility, `mlr3extralearners` also allows `mlr3` users and package d This **enriches each learner with extensive metadata** about its parameter space, prediction types, and other key attributes. Furthermore, `mlr3extralearners` includes robust mechanisms for **quality assurance**, such as regular sanity checks and verification tests that ensure learner parameters are consistent and up-to-date with the latest versions of their underlying R packages. In order to allow the integration of learners that are not available on `CRAN`, the package is hosted on the [`mlr` R-universe](https://mlr-org.r-universe.dev/). -By providing a standardized interface and comprehensive metadata for each learner, mlr3extralearners enhances the FAIRness (findability, accessibility, interoperability, and reusability) of machine learning algorithms within the R ecosystem [#wilkinson2016fair]). +By providing a standardized interface and comprehensive metadata for each learner, mlr3extralearners enhances the FAIRness (findability, accessibility, interoperability, and reusability) of machine learning algorithms within the R ecosystem [@wilkinson2016fair]). # Statement of Need @@ -171,10 +171,19 @@ Furthermore, the tests also validate the learners' metadata annotations, such as To streamline the addition of new learners, `mlr3extralearners` provides robust support tools: - **Code Templates**: Predefined templates are available for generating the necessary code for both the learner implementation and associated test files. 
-These templates are generated through an `R` function that uses learner metadata to prefill as much information as possible, leaving only minimal input required from the user. +These templates are generated through an `R` function that uses learner metadata to prefill as much information as possible, leaving only minimal input required from the contributor. - **Guides and Resources**: The package website contains an contains an [extensive tutorial](https://mlr3extralearners.mlr-org.com/articles/extending.html), as well as a curated list of [common issues](https://mlr3extralearners.mlr-org.com/articles/common_issues.html) encountered during learner integration, making the process accessible for contributors of all experience levels. Additionally, every integrated learner includes a simple example of usage in the documentation, ensuring that users can quickly understand how to utilize the learner effectively within the `mlr3` ecosystem. +# Community Impact and Future Directions + +`mlr3extralearners` is a direct result of the contributions from a diverse community of authors and developers. +The authors of this paper themselves have been actively involved in integrating learners, providing quality assurance, and maintaining the package's infrastructure. +Their contributions, such as the addition of learners for specialized tasks like survival analysis and high-dimensional data, highlight the impact that thoughtful integration has on the `mlr3` ecosystem. +This ongoing effort illustrates the transformative potential of **community-driven development**, ensuring that `mlr3extralearners` continues to grow as a dynamic and inclusive repository for cutting-edge machine learning algorithms. + +By fostering a spirit of collaboration, the `mlr3extralearners` project invites future contributors to follow this example, helping shape the package's evolution and making advanced machine learning tools accessible to a wider `R` audience. 
+ # Acknowledgements Sebastian Fischer is supported by the Deutsche Forschungsgemeinschaft (DFG, German Research From 92cef0ad8022f80ca7c894f08664a247226c3bb3 Mon Sep 17 00:00:00 2001 From: john Date: Thu, 12 Dec 2024 23:35:40 +0100 Subject: [PATCH 25/46] update author info --- paper/paper.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/paper/paper.md b/paper/paper.md index 11288ffea..5c6919028 100644 --- a/paper/paper.md +++ b/paper/paper.md @@ -59,6 +59,8 @@ authors: affiliation: "3, 11, 12" - name: Liana Harutyunyan - name: Pierre Camilleri + orchid: 0009-0005-1070-0670 + affiliation: 17 - name: Philipp Kopper orchid: 0000-0002-5037-7135 affiliation: 3 @@ -101,6 +103,8 @@ affiliations: index: 15 - name: Imperial College London index: 16 + - name: multi, 8 passage Brûlon, 75012 PARIS, France + index: 17 date: XXX December 2024 bibliography: paper.bib --- From 7f6a842c9e1be15a01dab0c0b03f30cb816457c2 Mon Sep 17 00:00:00 2001 From: john Date: Mon, 16 Dec 2024 13:35:09 +0100 Subject: [PATCH 26/46] mention tuning and cite paper --- paper/paper.bib | 18 ++++++++++++++++++ paper/paper.md | 2 +- 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/paper/paper.bib b/paper/paper.bib index c9228bf3c..48bac9f2b 100644 --- a/paper/paper.bib +++ b/paper/paper.bib @@ -196,3 +196,21 @@ @article{wilkinson2016fair publisher={Nature Publishing Group} } +@article{bischl2023, + archivePrefix = {arXiv}, + arxivId = {2107.05847}, + author = {Bischl, Bernd and Binder, Martin and Lang, Michel and Pielok, Tobias and Richter, Jakob and Coors, Stefan and Thomas, Janek and Ullmann, Theresa and Becker, Marc and Boulesteix, Anne-Laure and Deng, Difan and Lindauer, Marius}, + doi = {10.1002/WIDM.1484}, + eprint = {2107.05847}, + issn = {1942-4795}, + journal = {Wiley Interdisciplinary Reviews: Data Mining and Knowledge Discovery}, + keywords = {automl,hyperparameter optimization,machine learning,model selection,tuning}, + month = mar, + number = {2}, + pages = {e1484}, + 
publisher = {John Wiley \& Sons, Ltd}, + title = {Hyperparameter Optimization: Foundations, Algorithms, Best Practices, and Open Challenges}, + url = {https://wires.onlinelibrary.wiley.com/doi/10.1002/widm.1484}, + volume = {13}, + year = {2023} +} diff --git a/paper/paper.md b/paper/paper.md index 5c6919028..c412fe0dc 100644 --- a/paper/paper.md +++ b/paper/paper.md @@ -116,7 +116,7 @@ The `mlr3` ecosystem is a versatile toolbox for machine learning in `R` and is targeted towards both practitioners and researchers [@Bischl2024]. At its core, the `mlr3extralearners` package provides a standardized interface for machine learning and connects many R packages implementing machine learning algorithms into a unified framework. The package currently wraps **85 different learning algorithms** from many different R packages, for tasks such as classification, regression, and survival analysis. This enables users to seamlessly access and utilize these learners directly within their workflows. -It also facilitates large-scale empirical benchmark experiments, leveraging the `mlr3` framework's parallelization and optimization capabilities [@benchlargescale]. +It also facilitates large-scale empirical benchmark experiments, leveraging the `mlr3` framework's parallelization and optimization capabilities [@benchlargescale] as well as enabling efficient hyperparameter tuning [@bischl2023]. An overview of all `mlr3` learners, including those introduced through `mlr3extralearners`, is available on the [mlr3 website](https://mlr-org.com/learners.html). Beyond accessibility, `mlr3extralearners` also allows `mlr3` users and package developers to easily add their own learners to the ecosystem. 
From 1d503ef435dcdd674a441c855e0d723deb3280fe Mon Sep 17 00:00:00 2001 From: john Date: Tue, 14 Jan 2025 13:38:52 +0100 Subject: [PATCH 27/46] add Liana's affiliation --- paper/paper.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/paper/paper.md b/paper/paper.md index c412fe0dc..a950ce7cf 100644 --- a/paper/paper.md +++ b/paper/paper.md @@ -58,6 +58,7 @@ authors: orchid: 0009-0007-0949-8385 affiliation: "3, 11, 12" - name: Liana Harutyunyan + affiliation: 18 - name: Pierre Camilleri orchid: 0009-0005-1070-0670 affiliation: 17 @@ -105,6 +106,8 @@ affiliations: index: 16 - name: multi, 8 passage Brûlon, 75012 PARIS, France index: 17 + - name: ServiceTitan, Inc., Glendale, California + index: 18 date: XXX December 2024 bibliography: paper.bib --- From 3863cb8e2d80e44ca9c4ce7d63919224dbfc91da Mon Sep 17 00:00:00 2001 From: john Date: Tue, 4 Feb 2025 16:58:18 +0100 Subject: [PATCH 28/46] update author affiliations --- paper/paper.md | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/paper/paper.md b/paper/paper.md index a950ce7cf..f48e36165 100644 --- a/paper/paper.md +++ b/paper/paper.md @@ -15,7 +15,7 @@ authors: affiliation: 4 - name: Raphael Sonabend orcid: 0000-0001-9225-4654 - affiliation: "15, 16" + affiliation: 15 - name: Marc Becker orcid: 0000-0002-8115-0400 affiliation: "2, 3" @@ -58,10 +58,10 @@ authors: orchid: 0009-0007-0949-8385 affiliation: "3, 11, 12" - name: Liana Harutyunyan - affiliation: 18 + affiliation: 17 - name: Pierre Camilleri orchid: 0009-0005-1070-0670 - affiliation: 17 + affiliation: 16 - name: Philipp Kopper orchid: 0000-0002-5037-7135 affiliation: 3 @@ -102,12 +102,10 @@ affiliations: index: 14 - name: OSPO Now index: 15 - - name: Imperial College London - index: 16 - name: multi, 8 passage Brûlon, 75012 PARIS, France - index: 17 + index: 16 - name: ServiceTitan, Inc., Glendale, California - index: 18 + index: 17 date: XXX December 2024 bibliography: paper.bib --- From 
6926bf755f4be8c6f592dcc260078ab3815b4f6b Mon Sep 17 00:00:00 2001 From: Sebastian Fischer Date: Mon, 10 Mar 2025 05:16:29 +0100 Subject: [PATCH 29/46] Update paper/paper.md Co-authored-by: mb706 --- paper/paper.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paper/paper.md b/paper/paper.md index f48e36165..6057bfc4d 100644 --- a/paper/paper.md +++ b/paper/paper.md @@ -122,7 +122,7 @@ An overview of all `mlr3` learners, including those introduced through `mlr3extr Beyond accessibility, `mlr3extralearners` also allows `mlr3` users and package developers to easily add their own learners to the ecosystem. This **enriches each learner with extensive metadata** about its parameter space, prediction types, and other key attributes. -Furthermore, `mlr3extralearners` includes robust mechanisms for **quality assurance**, such as regular sanity checks and verification tests that ensure learner parameters are consistent and up-to-date with the latest versions of their underlying R packages. +Furthermore, `mlr3extralearners` includes robust mechanisms for **quality assurance**, such as regular automated sanity checks and verification tests that ensure learner parameters are consistent and up-to-date with the latest versions of their underlying R packages. In order to allow the integration of learners that are not available on `CRAN`, the package is hosted on the [`mlr` R-universe](https://mlr-org.r-universe.dev/). By providing a standardized interface and comprehensive metadata for each learner, mlr3extralearners enhances the FAIRness (findability, accessibility, interoperability, and reusability) of machine learning algorithms within the R ecosystem [@wilkinson2016fair]). 
From 138bc59feba205870f661c19568b8f2cf941b4ef Mon Sep 17 00:00:00 2001 From: Sebastian Fischer Date: Mon, 10 Mar 2025 05:17:01 +0100 Subject: [PATCH 30/46] Update paper/paper.md Co-authored-by: mb706 --- paper/paper.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paper/paper.md b/paper/paper.md index 6057bfc4d..2a80677bc 100644 --- a/paper/paper.md +++ b/paper/paper.md @@ -177,7 +177,7 @@ To streamline the addition of new learners, `mlr3extralearners` provides robust - **Code Templates**: Predefined templates are available for generating the necessary code for both the learner implementation and associated test files. These templates are generated through an `R` function that uses learner metadata to prefill as much information as possible, leaving only minimal input required from the contributor. -- **Guides and Resources**: The package website contains an contains an [extensive tutorial](https://mlr3extralearners.mlr-org.com/articles/extending.html), as well as a curated list of [common issues](https://mlr3extralearners.mlr-org.com/articles/common_issues.html) encountered during learner integration, making the process accessible for contributors of all experience levels. +- **Guides and Resources**: The package website contains an [extensive tutorial](https://mlr3extralearners.mlr-org.com/articles/extending.html), as well as a curated list of [common issues](https://mlr3extralearners.mlr-org.com/articles/common_issues.html) encountered during learner integration, making the process accessible for contributors of all experience levels. Additionally, every integrated learner includes a simple example of usage in the documentation, ensuring that users can quickly understand how to utilize the learner effectively within the `mlr3` ecosystem. 
# Community Impact and Future Directions From 1b1e020d747e53cbd4773def01f9305262cdfba5 Mon Sep 17 00:00:00 2001 From: Sebastian Fischer Date: Mon, 10 Mar 2025 05:17:45 +0100 Subject: [PATCH 31/46] Update paper/paper.md Co-authored-by: mb706 --- paper/paper.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paper/paper.md b/paper/paper.md index 2a80677bc..fe7c3d6d0 100644 --- a/paper/paper.md +++ b/paper/paper.md @@ -158,7 +158,7 @@ Each parameter is explicitly typed, with annotations for valid ranges and allowa This ensures valid configurations and simplifies tasks like parameter tuning. - **Task and Prediction Types**: Learners are categorized with respect to their task type (e.g. as classification, regression or survival analysis [@Sonabend2021]) and prediction types (e.g. probabilities or class predictions). This allows users to easily identify suitable learners for their specific modeling tasks. -- **Standardized Properties**: Learners are annotated with detailed attributes, including the types of features they can process and their support for functionalities such as feature selection such as feature selection, importance scoring, handling missing values, or monitoring performance on a separate validation set during training among others. +- **Standardized Properties**: Learners are annotated with detailed attributes, including the types of features they can process and their support for functionalities such as feature selection, importance scoring, handling missing values, or monitoring performance on a separate validation set during training among others. This allows users to have a clear understanding of a learner's capabilities and limitations and assess if it aligns with the specific requirements of their workflows, reducing trial-and-error and streamlining the modeling process. 
## Functional Correctness From 1e88216a6fb8d57294d07e3713d623bbc217615d Mon Sep 17 00:00:00 2001 From: Sebastian Fischer Date: Mon, 10 Mar 2025 05:17:55 +0100 Subject: [PATCH 32/46] Update paper/paper.md Co-authored-by: mb706 --- paper/paper.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paper/paper.md b/paper/paper.md index fe7c3d6d0..5c3148212 100644 --- a/paper/paper.md +++ b/paper/paper.md @@ -113,7 +113,7 @@ bibliography: paper.bib # Summary The [`mlr3extralearners`](https://mlr3extralearners.mlr-org.com/) [`R`](https://www.r-project.org/) [@R] package is a community-driven package that integrates external machine learning algorithms into the [`mlr3`](https://mlr3.mlr-org.com/) [@Lang2019] ecosystem. -The `mlr3` ecosystem is a versatile toolbox for machine learning in `R` and is targeted towards both practitioners and researchers [@Bischl2024]. +The `mlr3` ecosystem is a versatile toolbox for machine learning in `R` that is targeted towards both practitioners and researchers [@Bischl2024]. At its core, the `mlr3extralearners` package provides a standardized interface for machine learning and connects many R packages implementing machine learning algorithms into a unified framework. The package currently wraps **85 different learning algorithms** from many different R packages, for tasks such as classification, regression, and survival analysis. This enables users to seamlessly access and utilize these learners directly within their workflows. 
From fbf9cc558d0844fa0d9d8949c625655acb9bb38e Mon Sep 17 00:00:00 2001 From: Sebastian Fischer Date: Mon, 10 Mar 2025 05:18:27 +0100 Subject: [PATCH 33/46] Update paper/paper.md Co-authored-by: mb706 --- paper/paper.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paper/paper.md b/paper/paper.md index 5c3148212..64bb6a396 100644 --- a/paper/paper.md +++ b/paper/paper.md @@ -128,7 +128,7 @@ By providing a standardized interface and comprehensive metadata for each learne # Statement of Need -Machine learning often requires practitioners to navigate a diverse array of modeling problems, each with unique demands such as performance, interpretability, or compatibility with specific data types and tasks. +Machine learning often requires practitioners to navigate a diverse array of modeling problems, each with unique demands such as predictive performance, inference latency and throughput, interpretability, or compatibility with specific data types and tasks. To address this challenge, packages like `caret` [@caret] and `parsnip` [@parsnip] from the `tidymodels` ecosystem have historically provided unified interfaces for simplifying model experimentation [@tidymodels]. For instance, `parsnip` provides a clean and consistent way to define models, enabling users to experiment with different algorithms without dealing with the nuances of underlying package syntax. Similarly, the `mlr3` ecosystem aims to streamline model selection and experimentation, making it a versatile toolbox for machine learning in R. 
From bed2de639691fcbebed3154b427c823c5edfb28e Mon Sep 17 00:00:00 2001 From: Sebastian Fischer Date: Mon, 10 Mar 2025 05:18:44 +0100 Subject: [PATCH 34/46] Update paper/paper.md Co-authored-by: mb706 --- paper/paper.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paper/paper.md b/paper/paper.md index 64bb6a396..fe91ca2ca 100644 --- a/paper/paper.md +++ b/paper/paper.md @@ -153,7 +153,7 @@ However, the advantages of `mlr3extralearners` go well beyond simple integration One core feature of the `mlr3` ecosystem is that it annotates learners with extensive metadata. -- **Parameter Management**: The parameter spaces of learners are defined using parameter sets from the [`paradox` package](https://paradox.mlr-org.com/) [@paradox]. +- **Hyperparameter Management**: The hyperparameter spaces of learners are defined using `ParamSet` objects from the [`paradox` package](https://paradox.mlr-org.com/) [@paradox]. Each parameter is explicitly typed, with annotations for valid ranges and allowable values. This ensures valid configurations and simplifies tasks like parameter tuning. - **Task and Prediction Types**: Learners are categorized with respect to their task type (e.g. as classification, regression or survival analysis [@Sonabend2021]) and prediction types (e.g. probabilities or class predictions). From 0d14260a4559591d5c1b48e0fc1a9a4550318436 Mon Sep 17 00:00:00 2001 From: Sebastian Fischer Date: Mon, 10 Mar 2025 05:18:57 +0100 Subject: [PATCH 35/46] Update paper/paper.md Co-authored-by: mb706 --- paper/paper.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paper/paper.md b/paper/paper.md index fe91ca2ca..a91e0eb79 100644 --- a/paper/paper.md +++ b/paper/paper.md @@ -154,7 +154,7 @@ However, the advantages of `mlr3extralearners` go well beyond simple integration One core feature of the `mlr3` ecosystem is that it annotates learners with extensive metadata. 
- **Hyperparameter Management**: The hyperparameter spaces of learners are defined using `ParamSet` objects from the [`paradox` package](https://paradox.mlr-org.com/) [@paradox]. -Each parameter is explicitly typed, with annotations for valid ranges and allowable values. +Each hyperparameter is explicitly typed, with annotations for valid ranges and allowable values. This ensures valid configurations and simplifies tasks like parameter tuning. - **Task and Prediction Types**: Learners are categorized with respect to their task type (e.g. as classification, regression or survival analysis [@Sonabend2021]) and prediction types (e.g. probabilities or class predictions). This allows users to easily identify suitable learners for their specific modeling tasks. From 78b06d20b69389f81349447c66345a3e93feb414 Mon Sep 17 00:00:00 2001 From: Sebastian Fischer Date: Mon, 10 Mar 2025 05:19:08 +0100 Subject: [PATCH 36/46] Update paper/paper.md Co-authored-by: mb706 --- paper/paper.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paper/paper.md b/paper/paper.md index a91e0eb79..7978fc129 100644 --- a/paper/paper.md +++ b/paper/paper.md @@ -155,7 +155,7 @@ One core feature of the `mlr3` ecosystem is that it annotates learners with exte - **Hyperparameter Management**: The hyperparameter spaces of learners are defined using `ParamSet` objects from the [`paradox` package](https://paradox.mlr-org.com/) [@paradox]. Each hyperparameter is explicitly typed, with annotations for valid ranges and allowable values. -This ensures valid configurations and simplifies tasks like parameter tuning. +This ensures valid configurations and simplifies tasks like hyperparameter tuning. - **Task and Prediction Types**: Learners are categorized with respect to their task type (e.g. as classification, regression or survival analysis [@Sonabend2021]) and prediction types (e.g. probabilities or class predictions). 
This allows users to easily identify suitable learners for their specific modeling tasks. - **Standardized Properties**: Learners are annotated with detailed attributes, including the types of features they can process and their support for functionalities such as feature selection, importance scoring, handling missing values, or monitoring performance on a separate validation set during training among others. From 220b70683931399556e1bd1c48f446741138e10b Mon Sep 17 00:00:00 2001 From: Sebastian Fischer Date: Mon, 10 Mar 2025 05:19:22 +0100 Subject: [PATCH 37/46] Update paper/paper.md Co-authored-by: mb706 --- paper/paper.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paper/paper.md b/paper/paper.md index 7978fc129..d6137050d 100644 --- a/paper/paper.md +++ b/paper/paper.md @@ -156,7 +156,7 @@ One core feature of the `mlr3` ecosystem is that it annotates learners with exte - **Hyperparameter Management**: The hyperparameter spaces of learners are defined using `ParamSet` objects from the [`paradox` package](https://paradox.mlr-org.com/) [@paradox]. Each hyperparameter is explicitly typed, with annotations for valid ranges and allowable values. This ensures valid configurations and simplifies tasks like hyperparameter tuning. -- **Task and Prediction Types**: Learners are categorized with respect to their task type (e.g. as classification, regression or survival analysis [@Sonabend2021]) and prediction types (e.g. probabilities or class predictions). +- **Task and Prediction Types**: Learners are categorized with respect to their task type (e.g. as classification, regression or survival analysis [@Sonabend2021]) and prediction types (e.g. probabilities or response predictions). This allows users to easily identify suitable learners for their specific modeling tasks. 
- **Standardized Properties**: Learners are annotated with detailed attributes, including the types of features they can process and their support for functionalities such as feature selection, importance scoring, handling missing values, or monitoring performance on a separate validation set during training among others. This allows users to have a clear understanding of a learner's capabilities and limitations and assess if it aligns with the specific requirements of their workflows, reducing trial-and-error and streamlining the modeling process. From 4aecb11c60ee0c4db864dc652e2c4edf1d2b22b2 Mon Sep 17 00:00:00 2001 From: Sebastian Fischer Date: Mon, 10 Mar 2025 05:29:50 +0100 Subject: [PATCH 38/46] make links footnotes --- paper/paper.md | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/paper/paper.md b/paper/paper.md index d6137050d..118fdc12b 100644 --- a/paper/paper.md +++ b/paper/paper.md @@ -112,20 +112,24 @@ bibliography: paper.bib # Summary -The [`mlr3extralearners`](https://mlr3extralearners.mlr-org.com/) [`R`](https://www.r-project.org/) [@R] package is a community-driven package that integrates external machine learning algorithms into the [`mlr3`](https://mlr3.mlr-org.com/) [@Lang2019] ecosystem. +The `mlr3extralearners` `R` [@R] package is a community-driven package that integrates external machine learning algorithms into the `mlr3` [@Lang2019] ecosystem. The `mlr3` ecosystem is a versatile toolbox for machine learning in `R` that is targeted towards both practitioners and researchers [@Bischl2024]. At its core, the `mlr3extralearners` package provides a standardized interface for machine learning and connects many R packages implementing machine learning algorithms into a unified framework. The package currently wraps **85 different learning algorithms** from many different R packages, for tasks such as classification, regression, and survival analysis. 
This enables users to seamlessly access and utilize these learners directly within their workflows. It also facilitates large-scale empirical benchmark experiments, leveraging the `mlr3` framework's parallelization and optimization capabilities [@benchlargescale] as well as enabling efficient hyperparameter tuning [@bischl2023]. -An overview of all `mlr3` learners, including those introduced through `mlr3extralearners`, is available on the [mlr3 website](https://mlr-org.com/learners.html). +An overview of all `mlr3` learners, including those introduced through `mlr3extralearners`, is available on the mlr3 website.[^1] + +[^1]: https://mlr-org.com/learners.html Beyond accessibility, `mlr3extralearners` also allows `mlr3` users and package developers to easily add their own learners to the ecosystem. This **enriches each learner with extensive metadata** about its parameter space, prediction types, and other key attributes. Furthermore, `mlr3extralearners` includes robust mechanisms for **quality assurance**, such as regular automated sanity checks and verification tests that ensure learner parameters are consistent and up-to-date with the latest versions of their underlying R packages. -In order to allow the integration of learners that are not available on `CRAN`, the package is hosted on the [`mlr` R-universe](https://mlr-org.r-universe.dev/). +In order to allow the integration of learners that are not available on `CRAN`, the package is hosted on the `mlr` R-universe.[^2] By providing a standardized interface and comprehensive metadata for each learner, mlr3extralearners enhances the FAIRness (findability, accessibility, interoperability, and reusability) of machine learning algorithms within the R ecosystem [@wilkinson2016fair]). 
+[^2]: https://mlr-org.r-universe.dev + # Statement of Need Machine learning often requires practitioners to navigate a diverse array of modeling problems, each with unique demands such as predictive performance, inference latency and throughput, interpretability, or compatibility with specific data types and tasks. @@ -153,7 +157,7 @@ However, the advantages of `mlr3extralearners` go well beyond simple integration One core feature of the `mlr3` ecosystem is that it annotates learners with extensive metadata. -- **Hyperparameter Management**: The hyperparameter spaces of learners are defined using `ParamSet` objects from the [`paradox` package](https://paradox.mlr-org.com/) [@paradox]. +- **Hyperparameter Management**: The hyperparameter spaces of learners are defined using `ParamSet` objects from the `paradox` package [@paradox]. Each hyperparameter is explicitly typed, with annotations for valid ranges and allowable values. This ensures valid configurations and simplifies tasks like hyperparameter tuning. - **Task and Prediction Types**: Learners are categorized with respect to their task type (e.g. as classification, regression or survival analysis [@Sonabend2021]) and prediction types (e.g. probabilities or response predictions). @@ -177,9 +181,11 @@ To streamline the addition of new learners, `mlr3extralearners` provides robust - **Code Templates**: Predefined templates are available for generating the necessary code for both the learner implementation and associated test files. These templates are generated through an `R` function that uses learner metadata to prefill as much information as possible, leaving only minimal input required from the contributor. 
-- **Guides and Resources**: The package website contains an [extensive tutorial](https://mlr3extralearners.mlr-org.com/articles/extending.html), as well as a curated list of [common issues](https://mlr3extralearners.mlr-org.com/articles/common_issues.html) encountered during learner integration, making the process accessible for contributors of all experience levels. +- **Guides and Resources**: The package website[^3] contains an extensive tutorial, as well as a curated list of common issues encountered during learner integration, making the process accessible for contributors of all experience levels. Additionally, every integrated learner includes a simple example of usage in the documentation, ensuring that users can quickly understand how to utilize the learner effectively within the `mlr3` ecosystem. +[^3]: https://mlr3extralearners.mlr-org.com + # Community Impact and Future Directions `mlr3extralearners` is a direct result of the contributions from a diverse community of authors and developers. 
From bb3bc1affa73e8fc92ff6612dd4fe1f79ad21539 Mon Sep 17 00:00:00 2001 From: Sebastian Fischer Date: Mon, 10 Mar 2025 05:57:15 +0100 Subject: [PATCH 39/46] improve mlr3extralearners explanation --- paper/paper.bib | 8 ++++++++ paper/paper.md | 24 +++++++++++++----------- 2 files changed, 21 insertions(+), 11 deletions(-) diff --git a/paper/paper.bib b/paper/paper.bib index 48bac9f2b..d24e27ae1 100644 --- a/paper/paper.bib +++ b/paper/paper.bib @@ -214,3 +214,11 @@ @article{bischl2023 volume = {13}, year = {2023} } + +@Manual{mlr3learners, + title = {mlr3learners: Recommended Learners for 'mlr3'}, + author = {Michel Lang and Quay Au and Stefan Coors and Patrick Schratz and Marc Becker}, + year = {2024}, + note = {R package version 0.9.0}, + url = {https://CRAN.R-project.org/package=mlr3learners}, +} diff --git a/paper/paper.md b/paper/paper.md index 118fdc12b..a00635712 100644 --- a/paper/paper.md +++ b/paper/paper.md @@ -112,10 +112,11 @@ bibliography: paper.bib # Summary -The `mlr3extralearners` `R` [@R] package is a community-driven package that integrates external machine learning algorithms into the `mlr3` [@Lang2019] ecosystem. -The `mlr3` ecosystem is a versatile toolbox for machine learning in `R` that is targeted towards both practitioners and researchers [@Bischl2024]. -At its core, the `mlr3extralearners` package provides a standardized interface for machine learning and connects many R packages implementing machine learning algorithms into a unified framework. -The package currently wraps **85 different learning algorithms** from many different R packages, for tasks such as classification, regression, and survival analysis. +The `mlr3` ecosystem is a versatile toolbox for machine learning in `R` [@R] that is targeted towards both practitioners and researchers [@Bischl2024]. +The core `mlr3` package [@Lang2019] defines the standardized interface for machine learning, but does itself not implement many learning algorithms. 
+A collection of 21 recommended learning algorithms is available in the `mlr3learners` extension [@mlr3learners] which makes methods from various `R` packages available via the `mlr3` interface. +The `mlr3extralearners` `R` package is a *community-driven* package that integrates many more external machine learning algorithms into the `mlr3` ecosystem and allows users to request and contribute new learners. +The package currently wraps **85 different learning algorithms** from many different `R` packages, for tasks such as classification, regression, and survival analysis. This enables users to seamlessly access and utilize these learners directly within their workflows. It also facilitates large-scale empirical benchmark experiments, leveraging the `mlr3` framework's parallelization and optimization capabilities [@benchlargescale] as well as enabling efficient hyperparameter tuning [@bischl2023]. An overview of all `mlr3` learners, including those introduced through `mlr3extralearners`, is available on the mlr3 website.[^1] @@ -124,9 +125,9 @@ An overview of all `mlr3` learners, including those introduced through `mlr3extr Beyond accessibility, `mlr3extralearners` also allows `mlr3` users and package developers to easily add their own learners to the ecosystem. This **enriches each learner with extensive metadata** about its parameter space, prediction types, and other key attributes. -Furthermore, `mlr3extralearners` includes robust mechanisms for **quality assurance**, such as regular automated sanity checks and verification tests that ensure learner parameters are consistent and up-to-date with the latest versions of their underlying R packages. +Furthermore, `mlr3extralearners` includes robust mechanisms for **quality assurance**, such as regular automated sanity checks and verification tests that ensure learner parameters are consistent and up-to-date with the latest versions of their underlying `R` packages. 
In order to allow the integration of learners that are not available on `CRAN`, the package is hosted on the `mlr` R-universe.[^2] -By providing a standardized interface and comprehensive metadata for each learner, mlr3extralearners enhances the FAIRness (findability, accessibility, interoperability, and reusability) of machine learning algorithms within the R ecosystem [@wilkinson2016fair]). +By providing a standardized interface and comprehensive metadata for each learner, mlr3extralearners enhances the FAIRness (findability, accessibility, interoperability, and reusability) of machine learning algorithms within the `R` ecosystem [@wilkinson2016fair]). [^2]: https://mlr-org.r-universe.dev @@ -135,7 +136,7 @@ By providing a standardized interface and comprehensive metadata for each learne Machine learning often requires practitioners to navigate a diverse array of modeling problems, each with unique demands such as predictive performance, inference latency and throughput, interpretability, or compatibility with specific data types and tasks. To address this challenge, packages like `caret` [@caret] and `parsnip` [@parsnip] from the `tidymodels` ecosystem have historically provided unified interfaces for simplifying model experimentation [@tidymodels]. For instance, `parsnip` provides a clean and consistent way to define models, enabling users to experiment with different algorithms without dealing with the nuances of underlying package syntax. -Similarly, the `mlr3` ecosystem aims to streamline model selection and experimentation, making it a versatile toolbox for machine learning in R. +Similarly, the `mlr3` ecosystem aims to streamline model selection and experimentation, making it a versatile toolbox for machine learning in `R`. Within this ecosystem, `mlr3extralearners` plays a crucial role by providing a comprehensive collection of external machine learning algorithms integrated into the `mlr3` framework. 
This ensures that users can access a wide variety of learners to meet their specific needs, @@ -167,10 +168,10 @@ This allows users to have a clear understanding of a learner's capabilities and ## Functional Correctness -Integrating learners from diverse R packages poses challenges, particularly due to changes in upstream APIs. `mlr3extralearners` addresses these issues through rigorous checks: +Integrating learners from diverse `R` packages poses challenges, particularly due to changes in upstream APIs. `mlr3extralearners` addresses these issues through rigorous checks: - **Interface Consistency**: The package regularly verifies that each learner adheres to the expected interface of its upstream function. -When new parameters are introduced or existing ones change, the tests fail until the parameter sets are updated accordingly. +When new parameters are introduced or existing ones changed or removed, the tests fail until the parameter sets are updated accordingly. - **Automated Testing**: To ensure correctness, `mlr3extralearners` performs regular automated tests on all learners. These tests include sanity checks that, e.g., verify that the learners produce sensible predictions for simple tasks. Furthermore, the tests also validate the learners' metadata annotations, such as whether a learner can actually handle missing values or is able to produce importance scores. @@ -179,8 +180,9 @@ Furthermore, the tests also validate the learners' metadata annotations, such as To streamline the addition of new learners, `mlr3extralearners` provides robust support tools: -- **Code Templates**: Predefined templates are available for generating the necessary code for both the learner implementation and associated test files. -These templates are generated through an `R` function that uses learner metadata to prefill as much information as possible, leaving only minimal input required from the contributor. 
+- **Code Templates**: Predefined templates are available for both the learner implementation and associated test files. +Contributors can utilize these templates through an `R` function that accepts learner metadata and generates new `R` code files based on the templates. +This approach pre-fills as much information as possible, minimizing the input required from the contributor. - **Guides and Resources**: The package website[^3] contains an extensive tutorial, as well as a curated list of common issues encountered during learner integration, making the process accessible for contributors of all experience levels. Additionally, every integrated learner includes a simple example of usage in the documentation, ensuring that users can quickly understand how to utilize the learner effectively within the `mlr3` ecosystem. From a440c108aacee7f5e297b6f1209f789b889be3d2 Mon Sep 17 00:00:00 2001 From: Sebastian Fischer Date: Mon, 10 Mar 2025 05:59:39 +0100 Subject: [PATCH 40/46] wording --- paper/paper.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paper/paper.md b/paper/paper.md index a00635712..c80f9b49b 100644 --- a/paper/paper.md +++ b/paper/paper.md @@ -115,7 +115,7 @@ bibliography: paper.bib The `mlr3` ecosystem is a versatile toolbox for machine learning in `R` [@R] that is targeted towards both practitioners and researchers [@Bischl2024]. The core `mlr3` package [@Lang2019] defines the standardized interface for machine learning, but does itself not implement many learning algorithms. A collection of 21 recommended learning algorithms is available in the `mlr3learners` extension [@mlr3learners] which makes methods from various `R` packages available via the `mlr3` interface. -The `mlr3extralearners` `R` package is a *community-driven* package that integrates many more external machine learning algorithms into the `mlr3` ecosystem and allows users to request and contribute new learners. 
+The `mlr3extralearners` `R` package is a *community-driven* package that integrates many more external machine learning algorithms into the `mlr3` ecosystem and allows users to request and contribute new learner integrations. The package currently wraps **85 different learning algorithms** from many different `R` packages, for tasks such as classification, regression, and survival analysis. This enables users to seamlessly access and utilize these learners directly within their workflows. It also facilitates large-scale empirical benchmark experiments, leveraging the `mlr3` framework's parallelization and optimization capabilities [@benchlargescale] as well as enabling efficient hyperparameter tuning [@bischl2023]. From 0da64fe7f07761a93b936a07a81b3857b8039fe7 Mon Sep 17 00:00:00 2001 From: Sebastian Fischer Date: Mon, 10 Mar 2025 06:14:29 +0100 Subject: [PATCH 41/46] small changes --- paper/paper.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/paper/paper.md b/paper/paper.md index c80f9b49b..3f188c900 100644 --- a/paper/paper.md +++ b/paper/paper.md @@ -115,7 +115,7 @@ bibliography: paper.bib The `mlr3` ecosystem is a versatile toolbox for machine learning in `R` [@R] that is targeted towards both practitioners and researchers [@Bischl2024]. The core `mlr3` package [@Lang2019] defines the standardized interface for machine learning, but does itself not implement many learning algorithms. A collection of 21 recommended learning algorithms is available in the `mlr3learners` extension [@mlr3learners] which makes methods from various `R` packages available via the `mlr3` interface. -The `mlr3extralearners` `R` package is a *community-driven* package that integrates many more external machine learning algorithms into the `mlr3` ecosystem and allows users to request and contribute new learner integrations. 
+The `mlr3extralearners` `R` package is a *community-driven* package that integrates many more external machine learning algorithms into the `mlr3` ecosystem. The package currently wraps **85 different learning algorithms** from many different `R` packages, for tasks such as classification, regression, and survival analysis. This enables users to seamlessly access and utilize these learners directly within their workflows. It also facilitates large-scale empirical benchmark experiments, leveraging the `mlr3` framework's parallelization and optimization capabilities [@benchlargescale] as well as enabling efficient hyperparameter tuning [@bischl2023]. @@ -124,10 +124,10 @@ An overview of all `mlr3` learners, including those introduced through `mlr3extr [^1]: https://mlr-org.com/learners.html Beyond accessibility, `mlr3extralearners` also allows `mlr3` users and package developers to easily add their own learners to the ecosystem. -This **enriches each learner with extensive metadata** about its parameter space, prediction types, and other key attributes. +This **enriches each learner with extensive metadata** about its hyperparameter space, prediction types, and other key attributes. Furthermore, `mlr3extralearners` includes robust mechanisms for **quality assurance**, such as regular automated sanity checks and verification tests that ensure learner parameters are consistent and up-to-date with the latest versions of their underlying `R` packages. In order to allow the integration of learners that are not available on `CRAN`, the package is hosted on the `mlr` R-universe.[^2] -By providing a standardized interface and comprehensive metadata for each learner, mlr3extralearners enhances the FAIRness (findability, accessibility, interoperability, and reusability) of machine learning algorithms within the `R` ecosystem [@wilkinson2016fair]). 
+By providing a standardized interface and comprehensive metadata for each learner, mlr3extralearners enhances the FAIRness (findability, accessibility, interoperability, and reusability) of machine learning algorithms within the `R` ecosystem [@wilkinson2016fair]. [^2]: https://mlr-org.r-universe.dev From ec51463db123ea88397979c5d1dfb007081303ca Mon Sep 17 00:00:00 2001 From: Sebastian Fischer Date: Mon, 10 Mar 2025 06:19:56 +0100 Subject: [PATCH 42/46] some more fixes --- paper/paper.md | 33 +++++++++++++++++---------------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/paper/paper.md b/paper/paper.md index 3f188c900..1cdca1e4d 100644 --- a/paper/paper.md +++ b/paper/paper.md @@ -25,48 +25,49 @@ authors: - name: Martin Binder affiliation: 2 - name: Lennart Schneider - orchid: 0000-0003-4152-5308 + orcid: 0000-0003-4152-5308 affiliation: "2, 3" - name: Lukas Burk - orchid: 0000-0001-7528-3795 - affiliation: "2, 3, 4, 5" + orcid: 0000-0001-7528-3795 + affiliation: "2, 3, 5, 6" - name: Patrick Schratz orcid: 0000-0003-0748-6624 affiliation: 2 - name: Byron C. Jaeger - orchid: 0000-0001-7399-2299 + orcid: 0000-0001-7399-2299 affiliation: 13 - name: Stephen A Lauer - orchid: 0000-0003-2948-630X + orcid: 0000-0003-2948-630X affiliation: 7 - name: Lorenz A. 
Kapsner - orchid: 0000-0003-1866-860X + orcid: 0000-0003-1866-860X affiliation: 8 - name: Maximilian Mücke - orchid: 0009-0000-9432-9795 + orcid: 0009-0000-9432-9795 affiliation: 2 - - name: Zezhi Wang - orchid: 0000-0001-6988-5853 + - name: Zezhi Wang + orcid: 0000-0001-6988-5853 affiliation: 9 - name: Damir Pulatov - orchid: 0000-0003-4901-7201 + orcid: 0000-0003-4901-7201 affiliation: 14 - name: Keenan Ganz - orchid: 0000-0002-8486-3959 + orcid: 0000-0002-8486-3959 affiliation: 10 - name: Henri Funk - orchid: 0009-0007-0949-8385 + orcid: 0009-0007-0949-8385 affiliation: "3, 11, 12" - name: Liana Harutyunyan + orcid: 0000-0002-8486-3959 affiliation: 17 - name: Pierre Camilleri - orchid: 0009-0005-1070-0670 + orcid: 0009-0005-1070-0670 affiliation: 16 - name: Philipp Kopper - orchid: 0000-0002-5037-7135 + orcid: 0000-0002-5037-7135 affiliation: 3 - name: Andreas Bender - orchid: 0000-0001-5628-8611 + orcid: 0000-0001-5628-8611 affiliation: "2, 3" - name: Bernd Bischl orcid: 0000-0001-6002-6980 @@ -96,7 +97,7 @@ affiliations: index: 11 - name: Statistical Consulting Unit StaBLab, LMU Munich, Germany index: 12 - - name: Wake Forest University School of Medicine, Department of Biostatistics and Data Science, Division of Public Health SciencesWinston-Salem, North Carolina + - name: Wake Forest University School of Medicine, Department of Biostatistics and Data Science, Division of Public Health Sciences, Winston-Salem, North Carolina index: 13 - name: University of North Carolina Wilmington index: 14 From ab5686df92ce8b0a70f3e0c2ba39829aec4b8e90 Mon Sep 17 00:00:00 2001 From: Sebastian Fischer Date: Mon, 10 Mar 2025 06:31:19 +0100 Subject: [PATCH 43/46] fix affiliations --- paper/paper.md | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/paper/paper.md b/paper/paper.md index 1cdca1e4d..1d21b6974 100644 --- a/paper/paper.md +++ b/paper/paper.md @@ -21,9 +21,9 @@ authors: affiliation: "2, 3" - name: Michel Lang orcid: 0000-0001-9754-0393 - 
affiliation: "1, 2" + affiliation: "1" - name: Martin Binder - affiliation: 2 + affiliation: "2, 3" - name: Lennart Schneider orcid: 0000-0003-4152-5308 affiliation: "2, 3" @@ -32,7 +32,7 @@ authors: affiliation: "2, 3, 5, 6" - name: Patrick Schratz orcid: 0000-0003-0748-6624 - affiliation: 2 + affiliation: 18 - name: Byron C. Jaeger orcid: 0000-0001-7399-2299 affiliation: 13 @@ -107,6 +107,8 @@ affiliations: index: 16 - name: ServiceTitan, Inc., Glendale, California index: 17 + - name: devXY GmbH + index: 18 date: XXX December 2024 bibliography: paper.bib --- From f6138ceeb34739947d29e18c975333f666e10650 Mon Sep 17 00:00:00 2001 From: Sebastian Fischer Date: Mon, 24 Mar 2025 13:49:09 +0100 Subject: [PATCH 44/46] iterate bb review --- paper/paper.bib | 58 +++++++++++++++++++++++++++++++++++++++++ paper/paper.md | 69 +++++++++++++++++++++++++++---------------------- 2 files changed, 96 insertions(+), 31 deletions(-) diff --git a/paper/paper.bib b/paper/paper.bib index d24e27ae1..2ebf7375a 100644 --- a/paper/paper.bib +++ b/paper/paper.bib @@ -222,3 +222,61 @@ @Manual{mlr3learners note = {R package version 0.9.0}, url = {https://CRAN.R-project.org/package=mlr3learners}, } + + + @Manual{mlr3oml, + title = {mlr3oml: Connector Between 'mlr3' and 'OpenML'}, + author = {Michel Lang and Sebastian Fischer}, + year = {2024}, + note = {R package version 0.10.0}, + url = {https://CRAN.R-project.org/package=mlr3oml}, + } + + @article{vanschoren2014openml, + title={OpenML: networked science in machine learning}, + author={Vanschoren, Joaquin and Van Rijn, Jan N and Bischl, Bernd and Torgo, Luis}, + journal={ACM SIGKDD Explorations Newsletter}, + volume={15}, + number={2}, + pages={49--60}, + year={2014}, + publisher={ACM New York, NY, USA} +} + + +@article{binder2020collecting, + title={Collecting empirical data about hyperparameters for data driven AutoML}, + author={Binder, Martin and Pfisterer, Florian and Bischl, Bernd}, + journal={Democratizing Machine Learning 
Contributions in AutoML and Fairness}, + pages={93}, + year={2020} +} + +@Manual{mlr3batchmark, + title = {mlr3batchmark: Batch Experiments for 'mlr3'}, + author = {Marc Becker and Michel Lang}, + year = {2024}, + note = {R package version 0.2.0}, + url = {https://CRAN.R-project.org/package=mlr3batchmark}, +} + +@article{bischl2016mlr, + title={mlr: Machine Learning in R}, + author={Bischl, Bernd and Lang, Michel and Kotthoff, Lars and Schiffner, Julia and Richter, Jakob and Studerus, Erich and Casalicchio, Giuseppe and Jones, Zachary M}, + journal={Journal of Machine Learning Research}, + volume={17}, + number={170}, + pages={1--5}, + year={2016} +} + + +@article{lang2017batchtools, + title={batchtools: Tools for R to work on batch systems}, + author={Lang, Michel and Bischl, Bernd and Surmann, Dirk}, + journal={Journal of Open Source Software}, + volume={2}, + number={10}, + pages={135}, + year={2017} +} diff --git a/paper/paper.md b/paper/paper.md index 1d21b6974..08ba31477 100644 --- a/paper/paper.md +++ b/paper/paper.md @@ -109,46 +109,49 @@ affiliations: index: 17 - name: devXY GmbH index: 18 -date: XXX December 2024 +date: 24 March 2025 bibliography: paper.bib --- # Summary -The `mlr3` ecosystem is a versatile toolbox for machine learning in `R` [@R] that is targeted towards both practitioners and researchers [@Bischl2024]. -The core `mlr3` package [@Lang2019] defines the standardized interface for machine learning, but does itself not implement many learning algorithms. -A collection of 21 recommended learning algorithms is available in the `mlr3learners` extension [@mlr3learners] which makes methods from various `R` packages available via the `mlr3` interface. -The `mlr3extralearners` `R` package is a *community-driven* package that integrates many more external machine learning algorithms into the `mlr3` ecosystem. 
-The package currently wraps **85 different learning algorithms** from many different `R` packages, for tasks such as classification, regression, and survival analysis. +The `mlr3` ecosystem is a versatile toolbox for machine learning (ML) in `R` [@R] that is targeted towards both practitioners and researchers [@Bischl2024]. +The core `mlr3` package [@Lang2019] defines the standardized interface for ML, but its goal is not to implement algorithms. +This is, e.g., done by the `mlr3learners` extension [@mlr3learners] that connects 21 stable algorithms from various `R` packages to the `mlr3` ecosystem that serve as a good starting point for many ML tasks. +In addition, `mlr3extralearners` is a *community-driven* package that integrates many more ML algorithms. +The package currently wraps **85 different ML algorithms** from many different `R` packages, for tasks such as classification, regression, and survival analysis. This enables users to seamlessly access and utilize these learners directly within their workflows. -It also facilitates large-scale empirical benchmark experiments, leveraging the `mlr3` framework's parallelization and optimization capabilities [@benchlargescale] as well as enabling efficient hyperparameter tuning [@bischl2023]. -An overview of all `mlr3` learners, including those introduced through `mlr3extralearners`, is available on the mlr3 website.[^1] +One of its strengths is the design and implementation of large-scale benchmark experiments. +For example, datasets for such experiments can be easily obtained from the OpenML[^1] repository [@vanschoren2014openml] via the `mlr3oml` package [@mlr3oml]. +Furthermore, strong support for parallelization, including simplified submission on high-performance computing clusters via `batchtools` [@lang2017batchtools] and its `mlr3` integration `mlr3batchmark` [@mlr3batchmark], is provided by the framework and well documented [@benchlargescale]. 
+In combination, these tools allow for large-scale empirical investigations, which has, for example, been used to collect and analyze data about hyperparameter landscapes of ML algorithms [@binder2020collecting]. +An overview of all `mlr3` learners, including those introduced through `mlr3extralearners`, is available on the mlr3 website[^2]. -[^1]: https://mlr-org.com/learners.html +[^1]: https://openml.org +[^2]: https://mlr-org.com/learners.html -Beyond accessibility, `mlr3extralearners` also allows `mlr3` users and package developers to easily add their own learners to the ecosystem. +Beyond accessibility, `mlr3extralearners` also allows `mlr3` users to easily connect their own algorithms to the interface and also gives them the opportunity to share their implementations by contributing them to `mlr3extralearners`. This **enriches each learner with extensive metadata** about its hyperparameter space, prediction types, and other key attributes. Furthermore, `mlr3extralearners` includes robust mechanisms for **quality assurance**, such as regular automated sanity checks and verification tests that ensure learner parameters are consistent and up-to-date with the latest versions of their underlying `R` packages. -In order to allow the integration of learners that are not available on `CRAN`, the package is hosted on the `mlr` R-universe.[^2] -By providing a standardized interface and comprehensive metadata for each learner, mlr3extralearners enhances the FAIRness (findability, accessibility, interoperability, and reusability) of machine learning algorithms within the `R` ecosystem [@wilkinson2016fair]. +In order to allow the integration of learners that are not available on `CRAN`, the package is hosted on the `mlr` R-universe[^3]. 
+By providing a standardized interface and comprehensive metadata for each learner, `mlr3extralearners` enhances the FAIRness (findability, accessibility, interoperability, and reusability) of ML algorithms within the `R` ecosystem [@wilkinson2016fair]. -[^2]: https://mlr-org.r-universe.dev +[^3]: https://mlr-org.r-universe.dev # Statement of Need -Machine learning often requires practitioners to navigate a diverse array of modeling problems, each with unique demands such as predictive performance, inference latency and throughput, interpretability, or compatibility with specific data types and tasks. -To address this challenge, packages like `caret` [@caret] and `parsnip` [@parsnip] from the `tidymodels` ecosystem have historically provided unified interfaces for simplifying model experimentation [@tidymodels]. +ML often requires practitioners to navigate a diverse array of modeling problems, each with unique demands such as predictive performance, prediction speed, interpretability, or compatibility with specific data types and tasks. +To address this challenge, packages like `mlr3`'s predecessor `mlr` [@bischl2016mlr], `caret` [@caret], and more recently `parsnip` [@parsnip] from the `tidymodels` ecosystem [@tidymodels] were designed to provide unified interfaces for simplifying model experimentation. For instance, `parsnip` provides a clean and consistent way to define models, enabling users to experiment with different algorithms without dealing with the nuances of underlying package syntax. -Similarly, the `mlr3` ecosystem aims to streamline model selection and experimentation, making it a versatile toolbox for machine learning in `R`. +Similarly, the `mlr3` ecosystem aims to streamline model selection and experimentation, making it a versatile toolbox for ML in `R`. -Within this ecosystem, `mlr3extralearners` plays a crucial role by providing a comprehensive collection of external machine learning algorithms integrated into the `mlr3` framework. 
-This ensures that users can access a wide variety of learners to meet their specific needs, -and choose the most appropriate learner for their specific problem. +Within this ecosystem, `mlr3extralearners` plays a crucial role by providing a comprehensive collection of external ML algorithms integrated into the `mlr3` framework. +This ensures that users can access a wide variety of learners to meet their needs and choose the most appropriate algorithm for their particular problem. While connecting new learners to `mlr3` is straightforward and can be done on a per-need basis, integrating them into `mlr3extralearners` benefits the broader community by avoiding redundant effort and ensuring accessibility for all users. Additionally, contributions to `mlr3extralearners` are reviewed by the package maintainers, providing a layer of quality assurance. This review process ensures that integrated learners work as expected and adhere to the high standards of the `mlr3` ecosystem. -Beyond its utility for users, `mlr3extralearners` also offers significant advantages for developers of machine learning packages. +Beyond its utility for users, `mlr3extralearners` also offers significant advantages for developers of ML packages. By integrating a new algorithm into the `mlr3` ecosystem, developers can immediately make their methods accessible to a wider audience. This integration facilitates seamless tuning [@mlr3tuning] and preprocessing [@mlr3pipelines2021] through the broader `mlr3` framework, enhancing the usability and impact of their work. @@ -161,32 +164,35 @@ However, the advantages of `mlr3extralearners` go well beyond simple integration One core feature of the `mlr3` ecosystem is that it annotates learners with extensive metadata. -- **Hyperparameter Management**: The hyperparameter spaces of learners are defined using `ParamSet` objects from the `paradox` package [@paradox]. 
-Each hyperparameter is explicitly typed, with annotations for valid ranges and allowable values. +- **Hyperparameter management**: The hyperparameter spaces of learners are defined using `ParamSet` objects from the `paradox` package [@paradox]. +Each hyperparameter is explicitly typed, with annotations for feasible values. This ensures valid configurations and simplifies tasks like hyperparameter tuning. -- **Task and Prediction Types**: Learners are categorized with respect to their task type (e.g. as classification, regression or survival analysis [@Sonabend2021]) and prediction types (e.g. probabilities or response predictions). +- **Task and prediction types**: Learners are categorized with respect to their task type (e.g. as classification, regression or survival analysis [@Sonabend2021]) and prediction types (e.g. probabilities or response predictions). This allows users to easily identify suitable learners for their specific modeling tasks. -- **Standardized Properties**: Learners are annotated with detailed attributes, including the types of features they can process and their support for functionalities such as feature selection, importance scoring, handling missing values, or monitoring performance on a separate validation set during training among others. +- **Standardized properties**: Learners are annotated with detailed attributes, including the types of features they can process and their support for functionalities such as feature selection, importance scoring, handling missing values, or monitoring performance on a separate validation set during training among others. This allows users to have a clear understanding of a learner's capabilities and limitations and assess if it aligns with the specific requirements of their workflows, reducing trial-and-error and streamlining the modeling process. ## Functional Correctness -Integrating learners from diverse `R` packages poses challenges, particularly due to changes in upstream APIs. 
`mlr3extralearners` addresses these issues through rigorous checks: +Integrating learners from diverse `R` packages poses challenges, on the one hand because changes in upstream APIs need to be reflected in `mlr3extralearners` and on the other hand because we want to ensure a high level of quality of algorithms connected to `mlr3`. +`mlr3extralearners` addresses both points through automated checks: -- **Interface Consistency**: The package regularly verifies that each learner adheres to the expected interface of its upstream function. +- **Interface consistency**: The package regularly verifies that each learner adheres to the expected interface of the latest released version of its upstream function. When new parameters are introduced or existing ones changed or removed, the tests fail until the parameter sets are updated accordingly. -- **Automated Testing**: To ensure correctness, `mlr3extralearners` performs regular automated tests on all learners. -These tests include sanity checks that, e.g., verify that the learners produce sensible predictions for simple tasks. +- **Automated testing**: In general, writing unit tests for ML algorithms is challenging, because of edge-cases, numeric errors, and the fact that the input to these algorithms can be arbitrary datasets. +To ensure correctness, `mlr3extralearners` performs regular automated tests on all learners. +These tests include sanity checks that, e.g., verify that the learners produce sensible predictions for simple randomly generated datasets. +In the past, these tests detected bugs in the implementation of upstream packages and we have subsequently notified upstream package authors. Furthermore, the tests also validate the learners' metadata annotations, such as whether a learner can actually handle missing values or is able to produce importance scores. 
## Simplified Integration of New Learners To streamline the addition of new learners, `mlr3extralearners` provides robust support tools: -- **Code Templates**: Predefined templates are available for both the learner implementation and associated test files. +- **Code templates**: Predefined templates are available for both the learner implementation and associated test files. Contributors can utilize these templates through an `R` function that accepts learner metadata and generates new `R` code files based on the templates. This approach pre-fills as much information as possible, minimizing the input required from the contributor. -- **Guides and Resources**: The package website[^3] contains an extensive tutorial, as well as a curated list of common issues encountered during learner integration, making the process accessible for contributors of all experience levels. +- **Guides and resources**: The package website[^3] contains an extensive tutorial, as well as a curated list of common issues encountered during learner integration, making the process accessible for contributors of all experience levels. Additionally, every integrated learner includes a simple example of usage in the documentation, ensuring that users can quickly understand how to utilize the learner effectively within the `mlr3` ecosystem. [^3]: https://mlr3extralearners.mlr-org.com @@ -196,9 +202,10 @@ Additionally, every integrated learner includes a simple example of usage in the `mlr3extralearners` is a direct result of the contributions from a diverse community of authors and developers. The authors of this paper themselves have been actively involved in integrating learners, providing quality assurance, and maintaining the package's infrastructure. Their contributions, such as the addition of learners for specialized tasks like survival analysis and high-dimensional data, highlight the impact that thoughtful integration has on the `mlr3` ecosystem. 
-This ongoing effort illustrates the transformative potential of **community-driven development**, ensuring that `mlr3extralearners` continues to grow as a dynamic and inclusive repository for cutting-edge machine learning algorithms. +This ongoing effort illustrates the transformative potential of **community-driven development**, ensuring that `mlr3extralearners` continues to grow as a dynamic and inclusive repository for ML algorithms. By fostering a spirit of collaboration, the `mlr3extralearners` project invites future contributors to follow this example, helping shape the package's evolution and making advanced machine learning tools accessible to a wider `R` audience. +Future work will also focus on expanding the ecosystem through `mlr3torch` [@mlr3torch], which aims to seamlessly integrate deep learning models and neural network architectures within the `mlr3` framework. # Acknowledgements From 05736332e4d512b007942f56b3d5403c67b0e65a Mon Sep 17 00:00:00 2001 From: Sebastian Fischer Date: Mon, 24 Mar 2025 14:10:51 +0100 Subject: [PATCH 45/46] some more fixes --- paper/paper.bib | 9 +++++++++ paper/paper.md | 22 +++++++++++----------- 2 files changed, 20 insertions(+), 11 deletions(-) diff --git a/paper/paper.bib b/paper/paper.bib index 2ebf7375a..932b9d820 100644 --- a/paper/paper.bib +++ b/paper/paper.bib @@ -280,3 +280,12 @@ @article{lang2017batchtools pages={135}, year={2017} } + + + @Manual{mlr3torch, + title = {mlr3torch: Deep Learning with 'mlr3'}, + author = {Sebastian Fischer and Martin Binder}, + year = {2025}, + note = {R package version 0.2.1}, + url = {https://CRAN.R-project.org/package=mlr3torch}, + } \ No newline at end of file diff --git a/paper/paper.md b/paper/paper.md index 08ba31477..437f5ee79 100644 --- a/paper/paper.md +++ b/paper/paper.md @@ -117,20 +117,20 @@ bibliography: paper.bib The `mlr3` ecosystem is a versatile toolbox for machine learning (ML) in `R` [@R] that is targeted towards both practitioners and researchers 
[@Bischl2024]. The core `mlr3` package [@Lang2019] defines the standardized interface for ML, but its goal is not to implement algorithms. -This is, e.g., done by the `mlr3learners` extension [@mlr3learners] that connects 21 stable algorithms from various `R` packages to the `mlr3` ecosystem that serve as a good starting point for many ML tasks. -In addition, `mlr3extralearners` is a *community-driven* package that integrates many more ML algorithms. +This is, e.g., done by the `mlr3learners` extension [@mlr3learners] that connects 21 stable learning algorithms from various `R` packages to the `mlr3` ecosystem; these algorithms serve as a good starting point for many ML tasks. +In addition, `mlr3extralearners` is a *community-driven* package that integrates many more methods. The package currently wraps **85 different ML algorithms** from many different `R` packages, for tasks such as classification, regression, and survival analysis. This enables users to seamlessly access and utilize these learners directly within their workflows. -One of its strengths is the design and implementation of large-scale benchmark experiments. +One of the strengths of `mlr3` is the design and implementation of large-scale benchmark experiments. For example, datasets for such experiments can be easily obtained from the OpenML[^1] repository [@vanschoren2014openml] via the `mlr3oml` package [@mlr3oml]. -Furthermore, strong support for parallelization, including simplified submission on high-performance computing clusters via `batchtools` [@lang2017batchtools] and its `mlr3` integration `mlr3batchmark` [@mlr3batchmark], is provided by the framework and well documented [@benchlargescale]. +Furthermore, strong support for parallelization, including execution on high-performance computing clusters via `batchtools` [@lang2017batchtools] and its `mlr3` integration `mlr3batchmark` [@mlr3batchmark], is available and well documented [@benchlargescale]. 
In combination, these tools allow for large-scale empirical investigations, which has, for example, been used to collect and analyze data about hyperparameter landscapes of ML algorithms [@binder2020collecting]. An overview of all `mlr3` learners, including those introduced through `mlr3extralearners`, is available on the mlr3 website[^2]. [^1]: https://openml.org [^2]: https://mlr-org.com/learners.html -Beyond accessibility, `mlr3extralearners` also allows `mlr3` users to easily connect their own algorithms to the interface and also gives them the opportunity to share their implementations by contributing them to `mlr3extralearners`. +Beyond accessibility, `mlr3extralearners` also allows `mlr3` users to easily connect their own algorithms to the interface. This **enriches each learner with extensive metadata** about its hyperparameter space, prediction types, and other key attributes. Furthermore, `mlr3extralearners` includes robust mechanisms for **quality assurance**, such as regular automated sanity checks and verification tests that ensure learner parameters are consistent and up-to-date with the latest versions of their underlying `R` packages. In order to allow the integration of learners that are not available on `CRAN`, the package is hosted on the `mlr` R-universe[^3]. @@ -180,10 +180,10 @@ Integrating learners from diverse `R` packages poses challenges, on the one hand - **Interface consistency**: The package regularly verifies that each learner adheres to the expected interface of the latest released version of its upstream function. When new parameters are introduced or existing ones changed or removed, the tests fail until the parameter sets are updated accordingly. - **Automated testing**: In general, writing unit tests for ML algorithms is challenging, because of edge-cases, numeric errors, and the fact that the input to these algorithms can be arbitrary datasets. 
-To ensure correctness, `mlr3extralearners` performs regular automated tests on all learners. +Aimed at addressing these challenges, `mlr3extralearners` performs regular automated tests on all learners. These tests include sanity checks that, e.g., verify that the learners produce sensible predictions for simple randomly generated datasets. -In the past, these tests detected bugs in the implementation of upstream packages and we have subsequently notified upstream package authors. Furthermore, the tests also validate the learners' metadata annotations, such as whether a learner can actually handle missing values or is able to produce importance scores. +In the past, these tests have detected bugs in some upstream packages and we have subsequently notified their authors. ## Simplified Integration of New Learners @@ -192,10 +192,11 @@ To streamline the addition of new learners, `mlr3extralearners` provides robust - **Code templates**: Predefined templates are available for both the learner implementation and associated test files. Contributors can utilize these templates through an `R` function that accepts learner metadata and generates new `R` code files based on the templates. This approach pre-fills as much information as possible, minimizing the input required from the contributor. -- **Guides and resources**: The package website[^3] contains an extensive tutorial, as well as a curated list of common issues encountered during learner integration, making the process accessible for contributors of all experience levels. +Note that these templates can also be used when learners are only used locally for specific projects and not contributed to `mlr3extralearners`. +- **Guides and resources**: The package website[^4] contains an extensive tutorial, as well as a curated list of common issues encountered during learner integration, making the process accessible for contributors of all experience levels. 
Additionally, every integrated learner includes a simple example of usage in the documentation, ensuring that users can quickly understand how to utilize the learner effectively within the `mlr3` ecosystem. -[^3]: https://mlr3extralearners.mlr-org.com +[^4]: https://mlr3extralearners.mlr-org.com # Community Impact and Future Directions @@ -204,8 +205,7 @@ The authors of this paper themselves have been actively involved in integrating Their contributions, such as the addition of learners for specialized tasks like survival analysis and high-dimensional data, highlight the impact that thoughtful integration has on the `mlr3` ecosystem. This ongoing effort illustrates the transformative potential of **community-driven development**, ensuring that `mlr3extralearners` continues to grow as a dynamic and inclusive repository for ML algorithms. -By fostering a spirit of collaboration, the `mlr3extralearners` project invites future contributors to follow this example, helping shape the package's evolution and making advanced machine learning tools accessible to a wider `R` audience. -Future work will also focus on expanding the ecosystem through `mlr3torch` [@mlr3torch], which aims to seamlessly integrate deep learning models and neural network architectures within the `mlr3` framework. +Future work will also focus on expanding the ecosystem with more deep learning methods through `mlr3torch` [@mlr3torch], which aims to seamlessly integrate deep learning models and neural network architectures within the `mlr3` framework. 
# Acknowledgements From 7fe6c04b1d6e0f35d98a4ddaa6a3484faa2734ce Mon Sep 17 00:00:00 2001 From: Sebastian Fischer Date: Mon, 24 Mar 2025 14:14:29 +0100 Subject: [PATCH 46/46] remove wrong orcid --- paper/paper.md | 1 - 1 file changed, 1 deletion(-) diff --git a/paper/paper.md b/paper/paper.md index 437f5ee79..b2f9d09e9 100644 --- a/paper/paper.md +++ b/paper/paper.md @@ -58,7 +58,6 @@ authors: orcid: 0009-0007-0949-8385 affiliation: "3, 11, 12" - name: Liana Harutyunyan - orcid: 0000-0002-8486-3959 affiliation: 17 - name: Pierre Camilleri orcid: 0009-0005-1070-0670