diff --git a/DESCRIPTION b/DESCRIPTION index 93e966a..31b1a4a 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,43 +1,43 @@ Package: Rforestry Type: Package Title: Random Forests, Linear Trees, and Gradient Boosting for Inference and Interpretability -Version: 0.9.0.3 -Author: Sören Künzel, - Theo Saarinen, - Simon Walter, - Edward Liu, - Allen Tang, - Jasjeet Sekhon +Version: 0.11.0.0 +Authors@R: c( + person("Sören", "Künzel", role = "aut"), + person("Theo", "Saarinen", role = c("aut","cre"), email = "theo_s@berkeley.edu"), + person("Simon", "Walter", role = "aut"), + person("Sam", "Antonyan", role = "aut"), + person("Edward", "Liu", role = "aut"), + person("Allen", "Tang", role = "aut"), + person("Jasjeet", "Sekhon", role = "aut") + ) Maintainer: Theo Saarinen <theo_s@berkeley.edu> BugReports: https://github.com/forestry-labs/Rforestry/issues URL: https://github.com/forestry-labs/Rforestry -Description: Provides fast implementations of Honest Random Forests, +Description: Provides fast implementations of Random Forests, Gradient Boosting, and Linear Random Forests, with an emphasis on inference and interpretability. Additionally contains methods for variable importance, out-of-bag prediction, regression monotonicity, and several methods for missing data imputation. 
License: GPL (>=3) | file LICENSE Encoding: UTF-8 -LazyData: true Imports: Rcpp (>= 0.12.9), parallel, methods, visNetwork, - glmnet, + glmnet (>= 4.1), grDevices, onehot LinkingTo: Rcpp, RcppArmadillo, RcppThread -RoxygenNote: 7.1.1 -SystemRequirements: C++11 +RoxygenNote: 7.2.3 Suggests: testthat, knitr, rmarkdown, mvtnorm -VignetteBuilder: knitr Collate: 'R_preprocessing.R' 'RcppExports.R' diff --git a/R/compute_rf_lp.R b/R/compute_rf_lp.R index 566a8cc..ac81a7a 100644 --- a/R/compute_rf_lp.R +++ b/R/compute_rf_lp.R @@ -34,7 +34,7 @@ compute_lp <- function(object, feature.new, feature, p){ # Checks and parsing: - if (class(object) != "forestry") { + if (!inherits(object, "forestry")) { stop("The object submitted is not a forestry random forest") } feature.new <- as.data.frame(feature.new) diff --git a/R/plottree.R b/R/plottree.R index 5c82741..94e58ff 100644 --- a/R/plottree.R +++ b/R/plottree.R @@ -28,7 +28,7 @@ #' replace = FALSE, #' nodesizeStrictSpl = 10, #' mtry = 4, -#' ntree = 1000, +#' ntree = 10, #' minSplitGain = .004, #' linear = TRUE, #' overfitPenalty = 1.65, @@ -36,7 +36,7 @@ #' #' plot(x = ridge_rf) #' plot(x = ridge_rf, tree.id = 2) -#' plot(x = ridge_rf, tree.id = 1000) +#' plot(x = ridge_rf, tree.id = 10) #' #' @export #' @import visNetwork diff --git a/README.md b/README.md index e12433c..ee0a98f 100644 --- a/README.md +++ b/README.md @@ -11,8 +11,8 @@ and Linear Random Forests, with an emphasis on inference and interpretability. ## How to install 1. The GFortran compiler has to be up to date. GFortran Binaries can be found [here](https://gcc.gnu.org/wiki/GFortranBinaries). -2. The [devtools](https://github.com/hadley/devtools) package has to be installed. You can install it using, `install.packages("devtools")`. -3. The package contains compiled code, and you must have a development environment to install the development version. You can use `devtools::has_devel()` to check whether you do. 
If no development environment exists, Windows users download and install [Rtools](https://cran.r-project.org/bin/windows/Rtools/) and macOS users download and install [Xcode](https://itunes.apple.com/us/app/xcode/id497799835). +2. The [devtools](https://github.com/r-lib/devtools) package has to be installed. You can install it using, `install.packages("devtools")`. +3. The package contains compiled code, and you must have a development environment to install the development version. You can use `devtools::has_devel()` to check whether you do. If no development environment exists, Windows users download and install [Rtools](https://cran.r-project.org/bin/windows/Rtools/) and macOS users download and install [Xcode](https://apps.apple.com/us/app/xcode/id497799835). 4. The latest development version can then be installed using `devtools::install_github("forestry-labs/Rforestry")`. For Windows users, you'll need to skip 64-bit compilation `devtools::install_github("forestry-labs/Rforestry", INSTALL_opts = c('--no-multiarch'))` due to an outstanding gcc issue. 
diff --git a/man/plot-forestry.Rd b/man/plot-forestry.Rd index eaeaaf0..92d9f9f 100644 --- a/man/plot-forestry.Rd +++ b/man/plot-forestry.Rd @@ -37,7 +37,7 @@ ridge_rf <- forestry( replace = FALSE, nodesizeStrictSpl = 10, mtry = 4, - ntree = 1000, + ntree = 10, minSplitGain = .004, linear = TRUE, overfitPenalty = 1.65, @@ -45,6 +45,6 @@ ridge_rf <- forestry( plot(x = ridge_rf) plot(x = ridge_rf, tree.id = 2) -plot(x = ridge_rf, tree.id = 1000) +plot(x = ridge_rf, tree.id = 10) } diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index 38e3554..0037874 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -2,10 +2,16 @@ // Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 #include <RcppArmadillo.h> +#include <RcppThread.h> #include <Rcpp.h> using namespace Rcpp; +#ifdef RCPP_USE_GLOBAL_ROSTREAM +Rcpp::Rostream<true>& Rcpp::Rcout = Rcpp::Rcpp_cout_get(); +Rcpp::Rostream<false>& Rcpp::Rcerr = Rcpp::Rcpp_cerr_get(); +#endif + // rcpp_cppDataFrameInterface SEXP rcpp_cppDataFrameInterface(Rcpp::List x, Rcpp::NumericVector y, Rcpp::NumericVector catCols, Rcpp::NumericVector linCols, int numRows, int numColumns, Rcpp::NumericVector featureWeights, Rcpp::NumericVector featureWeightsVariables, Rcpp::NumericVector deepFeatureWeights, Rcpp::NumericVector deepFeatureWeightsVariables, Rcpp::NumericVector observationWeights, Rcpp::NumericVector monotonicConstraints); RcppExport SEXP _Rforestry_rcpp_cppDataFrameInterface(SEXP xSEXP, SEXP ySEXP, SEXP catColsSEXP, SEXP linColsSEXP, SEXP numRowsSEXP, SEXP numColumnsSEXP, SEXP featureWeightsSEXP, SEXP featureWeightsVariablesSEXP, SEXP deepFeatureWeightsSEXP, SEXP deepFeatureWeightsVariablesSEXP, SEXP observationWeightsSEXP, SEXP monotonicConstraintsSEXP) { diff --git a/src/forestry.cpp b/src/forestry.cpp index d4b5be2..6e28ee3 100644 --- a/src/forestry.cpp +++ b/src/forestry.cpp @@ -165,7 +165,6 @@ void forestry::addTrees(size_t ntree) { const unsigned int see = this->getSeed(); size_t splitSampleSize = (size_t) (getSplitRatio() * getSampleSize()); - float percent_complete 
= 0.0; // Rcpp::Rcout << "Training Progress: " << std::endl; // R_FlushConsole(); @@ -913,7 +912,7 @@ std::vector<std::vector<double>>* forestry::neighborhoodImpute( // is inefficient because we have to iterate over the // vector about 1.5 times. double runningMax = -std::numeric_limits<double>::infinity(); - size_t maxPosition; + size_t maxPosition=0; for(size_t l = 0; l < categoryContribution.size(); l++) { if(categoryContribution[l] > runningMax) { runningMax = categoryContribution[l]; diff --git a/src/forestryTree.cpp b/src/forestryTree.cpp index c4e4c7f..96a0d84 100644 --- a/src/forestryTree.cpp +++ b/src/forestryTree.cpp @@ -347,7 +347,7 @@ void splitDataIntoTwoParts( double rightMean = trainingData->partitionMean(rightPartitionIndex); for (const auto& index : naIndices) { - if (abs(trainingData->getOutcomePoint(index) - leftMean) < abs(trainingData->getOutcomePoint(index) - rightMean)) { + if (fabs(trainingData->getOutcomePoint(index) - leftMean) < fabs(trainingData->getOutcomePoint(index) - rightMean)) { leftPartitionIndex->push_back(index); naLeftCount++; } else { diff --git a/src/treeSplitting.cpp b/src/treeSplitting.cpp index 7556064..05f3848 100644 --- a/src/treeSplitting.cpp +++ b/src/treeSplitting.cpp @@ -1451,7 +1451,7 @@ void findBestSplitImpute( // If closer to left partitionmean, add to left sum, leftcount ++ // This is okay to do after we check monotonicity, this shouldn't change // the ordering of the partition means as we allocate the NA examples greedily - if (abs(currOutcome - leftPartitionMean) < abs(currOutcome - rightPartitionMean)) { + if (fabs(currOutcome - leftPartitionMean) < fabs(currOutcome - rightPartitionMean)) { LeftPartitionNaSum += currOutcome; leftPartitionNaCount++; } @@ -1638,7 +1638,7 @@ void findBestSplitImputeCategorical( double currOutcome = std::get<1>(pair); // If closer to left partitionmean, add to left sum, leftcount ++ - if (abs(currOutcome - leftPartitionMean) < abs(currOutcome - rightPartitionMean)) { + if (fabs(currOutcome - 
leftPartitionMean) < fabs(currOutcome - rightPartitionMean)) { LeftPartitionNaSum += currOutcome; leftPartitionNaCount++; } diff --git a/tests/testthat/test-OOBpredictions.R b/tests/testthat/test-OOBpredictions.R index 47a90b9..55a0870 100644 --- a/tests/testthat/test-OOBpredictions.R +++ b/tests/testthat/test-OOBpredictions.R @@ -25,11 +25,11 @@ test_that("Tests if OOB predictions are working correctly (normal setting)", { skip_if_not_mac() - expect_equal(all.equal(getOOBpreds(forest)[1:10], c(5.092647817, 4.664031165, - 4.650426049, 4.870883947, - 5.084049999, 5.344246144, - 5.069991851, 5.060238528, - 4.766551234, 4.790776227)), TRUE) + expect_equal(all.equal(getOOBpreds(forest)[1:10], c(5.09195014288, 4.66466649643, + 4.65042604918, 4.87281687100, + 5.08349279822, 5.34483093904, + 5.06971226922, 5.06069487707, + 4.76761805874, 4.79213639568)), TRUE) }) diff --git a/tests/testthat/test-compute_rf_lp.R b/tests/testthat/test-compute_rf_lp.R index a705fca..8634eef 100644 --- a/tests/testthat/test-compute_rf_lp.R +++ b/tests/testthat/test-compute_rf_lp.R @@ -33,14 +33,14 @@ test_that("Tests that compute the lp distances works correctly", { skip_if_not_mac() expect_equal(distances_1, - c(0.74127647652339, 0.56269154186560, 0.66700207007833, 0.48143305071905, - 0.42691537245113, 0.79361471149614, 0.69064814060102, 0.60005881782247, - 0.77731344373143, 0.53970499669885, 0.67328392159715), - tolerance = 1e-12) + c(0.741505444777, 0.557691541866, 0.667464191290, 0.481433050719, + 0.428715372451, 0.793614711496, 0.690142079995, 0.598410669674, + 0.777646777065, 0.542104996699, 0.671783921597), + tolerance = 1e-6) expect_equal(distances_2, - c(2.3726809930918, 2.4972611231916, 2.7047479310938, 1.9000801210562, - 1.6384876050554, 2.4063455932161, 2.1012051982558, 2.4272638737974, - 3.0785442045313, 2.4121460046764, 2.2978840528426), - tolerance = 1e-12) + c(2.37298192278, 2.48866693581, 2.70150942321, 1.90008012106, + 1.63721780222, 2.40468205396, 2.09935249333, 
2.42295512410, + 3.07584694340, 2.41614965345, 2.29615354894), + tolerance = 1e-6) }) diff --git a/tests/testthat/test-forestry.R b/tests/testthat/test-forestry.R index 325d5bb..47dbe7f 100644 --- a/tests/testthat/test-forestry.R +++ b/tests/testthat/test-forestry.R @@ -31,7 +31,7 @@ test_that("Tests that random forest is working correctly", { skip_if_not_mac() mean((y_pred - y) ^ 2) - expect_equal(mean((y_pred - y) ^ 2), 0.064760523023031, tolerance = 1e-12) + expect_equal(mean((y_pred - y) ^ 2), 0.0646744712442, tolerance = 1e-6) # Test factors with missing obs and unused levels are correctly handled x$Species[1:70] <- NA @@ -39,5 +39,5 @@ test_that("Tests that random forest is working correctly", { x, y, seed = 2) y_pred <- predict(forest, x, seed = 2) - expect_equal(mean((y_pred - y) ^ 2), 0.10277113497796, tolerance = 1e-12) + expect_equal(mean((y_pred - y) ^ 2), 0.102692470551, tolerance = 1e-6) }) diff --git a/tests/testthat/test-impute_features.R b/tests/testthat/test-impute_features.R index 38e5166..ad401da 100644 --- a/tests/testthat/test-impute_features.R +++ b/tests/testthat/test-impute_features.R @@ -20,7 +20,7 @@ skip_if_not_mac() forest <- forestry(x_with_miss, y, ntree = 500, seed = 2, nthread = 1) imputed_x <- impute_features(forest, x_with_miss, seed = 2) expect_equal(sum(imputed_x$Species != x$Species), 2) -expect_equal(mean(abs(x$Sepal.Width - imputed_x$Sepal.Width)), 0.074894503323687369734, tolerance = 1e-6) +expect_equal(mean(abs(x$Sepal.Width - imputed_x$Sepal.Width)), 0.0748406666537, tolerance = 1e-6) # Testing mean imputation fallback: set.seed(1) diff --git a/tests/testthat/test-max-depth.R b/tests/testthat/test-max-depth.R index 454a039..01fde7e 100644 --- a/tests/testthat/test-max-depth.R +++ b/tests/testthat/test-max-depth.R @@ -28,5 +28,5 @@ test_that("Tests that maxDepth parameter is working correctly", { skip_if_not_mac() # Mean Square Error - expect_equal(sum((y_pred - y) ^ 2), 11.076804560351238393, tolerance = 1e-12) + 
expect_equal(sum((y_pred - y) ^ 2), 11.075656291850588531, tolerance = 1e-6) }) diff --git a/tests/testthat/test-multilayerForestry.R b/tests/testthat/test-multilayerForestry.R index 5cfa71a..977b6a8 100644 --- a/tests/testthat/test-multilayerForestry.R +++ b/tests/testthat/test-multilayerForestry.R @@ -33,5 +33,5 @@ test_that("Tests that multilayerForestry is working correctly", { # Multilayer forestry is non deterministic, this needs to be fixed, but for # now test that it at least runs without crashing - expect_equal(sum((y_pred - y) ^ 2), 13.849777575910220, tolerance = 1e-8) + expect_equal(sum((y_pred - y) ^ 2), 13.841433707739128067, tolerance = 1e-8) }) diff --git a/tests/testthat/test-observationWeights.R b/tests/testthat/test-observationWeights.R index af0e287..99d5c8d 100644 --- a/tests/testthat/test-observationWeights.R +++ b/tests/testthat/test-observationWeights.R @@ -31,7 +31,7 @@ test_that("Tests that observationWeights for the bootstrap is working correctly" skip_if_not_mac() # Check the predictions from a weighted forest - expect_equal(sum((y_pred - y) ^ 2), 8.658285584297157, tolerance = 1e-8) + expect_equal(sum((y_pred - y) ^ 2), 8.6556977338637377528, tolerance = 1e-8) forest <- make_savable(forest) diff --git a/vignettes/example.Rmd b/vignettes/example.Rmd deleted file mode 100644 index c1b4dd1..0000000 --- a/vignettes/example.Rmd +++ /dev/null @@ -1,33 +0,0 @@ ---- -title: "hte -- Heterogeneous Treatment Effect Estimation" -author: "Soeren Kuenzel, Allen Tang, Peter Bickel, Bin Yu, Jasjeet Sekhon" -date: "`r Sys.Date()`" -output: rmarkdown::html_vignette -vignette: > - %\VignetteIndexEntry{hte -- Heterogeneous Treatment Effect Estimation} - %\VignetteEngine{knitr::rmarkdown} - %\VignetteEncoding{UTF-8} ---- - -## Vignette Info - `vignette` `\VignetteIndexEntry` to match the title of your vignette. - -```{r, fig.show='hold', fig.cap = "Your figure caption."} -plot(1:10) -plot(10:1) -``` - -**knitr**. 
- -## More Examples - -$Y = X\beta + \epsilon$, footnotes^[A footnote here.], and tables, e.g. using `knitr::kable()`. - -```{r, echo=FALSE, results='asis'} -knitr::kable(head(mtcars, 10)) -``` - -Also a quote using `>`: - -> "He who gives up [code] safety for [code] speed deserves neither." -([via](https://twitter.com/hadleywickham/status/504368538874703872))