Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion Project.toml
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
name = "MLJBase"
uuid = "a7f614a8-145f-11e9-1d2a-a57a1082229d"
version = "1.13.0"
authors = ["Anthony D. Blaom <anthony.blaom@gmail.com>"]
version = "1.12.1"

[deps]
CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597"
CategoricalDistributions = "af321ab8-2d2e-40a6-b165-3d674595d28e"
ComputationalResources = "ed09eef8-17a6-5b46-8889-db040fac31e3"
DataAPI = "9a962f9c-6df0-11e9-0e5d-c546b8b5ee8a"
Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
DelimitedFiles = "8bb1440f-4735-579b-a4ab-409b98df4dab"
Distributed = "8ba89e20-285c-5b6f-9357-94700520ee1b"
Expand All @@ -17,6 +18,7 @@ InvertedIndices = "41ab1584-1d38-5bbf-9106-f11c6c58b48f"
LearnAPI = "92ad9a40-7767-427a-9ee6-6e577f1266cb"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
MLJModelInterface = "e80e1ace-859a-464e-9ed9-23947d8ae3ea"
Measurements = "eff96d63-e80a-5855-80a2-b1b0885c5ab7"
Missings = "e1d29d7a-bbdc-5cf2-9ac0-f12de2c33e28"
OrderedCollections = "bac558e1-5e72-5ebc-8fee-abe8a469f55d"
Parameters = "d96e819e-fc66-5662-9728-84c9c7592b0a"
Expand All @@ -43,12 +45,14 @@ DefaultMeasuresExt = "StatisticalMeasures"
CategoricalArrays = "1"
CategoricalDistributions = "0.2"
ComputationalResources = "0.3"
DataAPI = "1.16.0"
DelimitedFiles = "1"
Distributions = "0.25.3"
FillArrays = "1.14.0"
InvertedIndices = "1"
LearnAPI = "2"
MLJModelInterface = "1.11"
Measurements = "2.14"
Missings = "0.4, 1"
OrderedCollections = "1.1"
Parameters = "0.12"
Expand Down
7 changes: 5 additions & 2 deletions src/MLJBase.jl
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ end
import LearnAPI
import StatisticalTraits.snakecase
import StatisticalTraits.info
import Measurements

# Interface

Expand Down Expand Up @@ -66,7 +67,8 @@ import PrettyTables
using DelimitedFiles
using OrderedCollections
using CategoricalArrays
import CategoricalArrays.DataAPI.unwrap
import DataAPI
import DataAPI: unwrap, describe
import InvertedIndices: Not
import Dates

Expand Down Expand Up @@ -175,6 +177,7 @@ include("resampling/evaluation_results.jl")
include("resampling/logging.jl")
include("resampling/evaluate.jl")
include("resampling/resampler.jl")
include("resampling/describe.jl")

include("hyperparam/one_dimensional_ranges.jl")
include("hyperparam/one_dimensional_range_methods.jl")
Expand Down Expand Up @@ -303,7 +306,7 @@ export TransformedTargetModel

# resampling.jl:
export ResamplingStrategy, InSample, Holdout, CV, StratifiedCV, TimeSeriesCV,
evaluate!, Resampler, PerformanceEvaluation, CompactPerformanceEvaluation
evaluate!, Resampler, PerformanceEvaluation, CompactPerformanceEvaluation, describe

# `MLJType` and the abstract `Model` subtypes are exported from within
# src/composition/abstract_types.jl
Expand Down
21 changes: 1 addition & 20 deletions src/composition/models/pipelines.jl
Original file line number Diff line number Diff line change
Expand Up @@ -12,25 +12,6 @@

# # HELPERS

# Make a collection of symbols unique by suffixing each repeated symbol with the
# smallest integer (starting at 2) that yields a name not already emitted. For example,
# `individuate([:x, :y, :x, :x]) == [:x, :y, :x2, :x3]`.
function individuate(v)
    # an empty collection is returned as is:
    isempty(v) && return v

    unique_names = Symbol[first(v)]
    for name in v[2:end]
        if !(name in unique_names)
            push!(unique_names, name)
            continue
        end
        # `name` is already taken; search for the first free numbered variant:
        suffix = 2
        while Symbol(string(name, suffix)) in unique_names
            suffix += 1
        end
        push!(unique_names, Symbol(string(name, suffix)))
    end
    return unique_names
end

function as_type(prediction_type::Symbol)
if prediction_type == :deterministic
return Deterministic
Expand Down Expand Up @@ -151,7 +132,7 @@ function pipe_named_tuple(names, components)
isempty(names) && throw(ERR_EMPTY_PIPELINE)

# make keys unique:
names = names |> individuate |> Tuple
names = names |> MLJBase.individuate |> Tuple

# check sequence:
supervised_components = filter(components) do c
Expand Down
64 changes: 64 additions & 0 deletions src/resampling/describe.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
"""
    describe(evaluation::MLJBase.AbstractPerformanceEvaluation)

Return a named tuple summarizing the MLJ performance `evaluation`, as returned by the
methods [`evaluate`](@ref) and [`evaluate!`](@ref). The first entry is the evaluation's
`tag`; the remaining entries are the aggregated measurements, one per measure, keyed on
the measure name (repeated names are made unique with a numeric suffix). A measurement
is decorated with its 95% radius of uncertainty whenever that radius is a finite real
number. See also [`PerformanceEvaluation`](@ref).

This is particularly useful for tabulating multiple evaluations, as shown in the following
example, which assumes you have MLJ, NearestNeighborModels, and DecisionTree in your
package environment.

```julia-repl
using MLJ
X, y = @load_iris # a table and a vector

# instantiate two models:
KNNClassifier = @load KNNClassifier pkg=NearestNeighborModels
DecisionTreeClassifier = @load DecisionTreeClassifier pkg=DecisionTree
knn = KNNClassifier()
tree = DecisionTreeClassifier()

named_models = [
    "Dummy" => ConstantClassifier(), # a built-in model
    "K-nearest neighbors" => knn,
    "Decision Tree" => tree,
]
performance_evaluations = evaluate(named_models, X, y; measures=[brier_score, accuracy])

julia> describe(performance_evaluations[1])
(tag = "Dummy", BrierScore = -0.573 ± 0.1, Accuracy = 0.33 ± 0.23)

table = describe.(performance_evaluations)

julia> pretty(table)
┌─────────────────────┬──────────────────────┬──────────────────────┐
│ tag                 │ BrierScore           │ Accuracy             │
│ String              │ Measurement{Float64} │ Measurement{Float64} │
│ Textual             │ Continuous           │ Continuous           │
├─────────────────────┼──────────────────────┼──────────────────────┤
│ Dummy               │ -0.573±0.1           │ 0.33±0.23            │
│ K-nearest neighbors │ -0.21±0.21           │ 0.92±0.18            │
│ Decision Tree       │ -0.00118977±0.0      │ 1.0±0.0              │
└─────────────────────┴──────────────────────┴──────────────────────┘
```

"""
function DataAPI.describe(e::AbstractPerformanceEvaluation)
    # one label per measure: the part of its `_repr_` preceding the first "(":
    raw_labels = map(e.measure) do measure
        first(split(_repr_(measure), "("))
    end
    # repeated labels get a numeric suffix, so that the keys below are unique:
    labels = MLJBase.individuate(raw_labels, delim="")

    entries = Any[:tag => e.tag]
    for (i, label) in enumerate(labels)
        measurement = e.measurement[i]
        radius = e.uncertainty_radius_95[i]
        # decorate with the uncertainty radius, but only if it is a finite real
        # (`nothing` is not a `Real`, so it is excluded here too):
        entry = if radius isa Real && !isinf(radius)
            Measurements.measurement(measurement, radius)
        else
            measurement
        end
        push!(entries, Symbol(label) => entry)
    end
    return NamedTuple(entries)
end
33 changes: 24 additions & 9 deletions src/resampling/evaluate.jl
Original file line number Diff line number Diff line change
Expand Up @@ -265,11 +265,12 @@ log_evaluation(logger, performance_evaluation) = nothing

Estimate the performance of a machine `mach` wrapping a supervised model in data, using
the specified `resampling` strategy (defaulting to 6-fold cross-validation) and `measure`,
which can be a single measure or vector. Returns a [`PerformanceEvaluation`](@ref)
object.
which can be a single measure or vector. Returns a [`PerformanceEvaluation`](@ref) or
[`CompactPerformanceEvaluation`](@ref) object. To obtain a brief named tuple summary of
this object, suitable for tabulation, apply [`describe`](@ref).

In place of `mach`, one can use `tag_string => mach`, or a vector of either of these forms,
to return a vector of performance evaluation objects.
to return a vector of performance evaluation objects.

Available resampling strategies are $RESAMPLING_STRATEGIES_LIST. If `resampling` is not an
instance of one of these, then a vector of tuples of the form `(train_rows, test_rows)`
Expand Down Expand Up @@ -374,18 +375,32 @@ show(e)

Evaluate multiple machines:

```julia
```julia-repl
@load KNNClassifier pkg=NearestNeighborModels
mach1 = machine(ConstantClassifier(), X, y)
mach2 = machine(KNNClassifier(), X , y)
evaluate!(["const" => mach1, "knn" => mach2])
# 2-element Vector{...}
# PerformanceEvaluation("const", 0.698 ± 0.0062)
# PerformanceEvaluation("knn", 2.22e-16 ± 0.0)
julia> performance_evaluations = evaluate!(["const" => mach1, "knn" => mach2])
2-element Vector{...}
PerformanceEvaluation("const", 0.698 ± 0.0062)
PerformanceEvaluation("knn", 2.22e-16 ± 0.0)
```

Tabulate the results:

```julia-repl
describe.(performance_evaluations) |> pretty # or `|> DataFrames.DataFrame`
┌─────────┬──────────────────────┐
│ tag │ LogLoss │
│ String │ Measurement{Float64} │
│ Textual │ Continuous │
├─────────┼──────────────────────┤
│ const │ 0.6977±0.0062 │
│ knn │ 2.22045e-16±0.0 │
└─────────┴──────────────────────┘
```

See also [`evaluate`](@ref), [`PerformanceEvaluation`](@ref),
[`CompactPerformanceEvaluation`](@ref).
[`CompactPerformanceEvaluation`](@ref), [`describe`](@ref).

"""
function evaluate!(
Expand Down
47 changes: 47 additions & 0 deletions src/utilities.jl
Original file line number Diff line number Diff line change
Expand Up @@ -555,3 +555,50 @@ converted to a vector.

"""
guess_model_target_observation_scitype(model) = observation(target_scitype(model))


# ## MAKING A COLLECTION OF STRINGS OR SYMBOLS UNIQUE

# Return `s` with `delim` and `n` appended. A string input gives a string; a symbol
# input gives a symbol.
append_digit(s::AbstractString, delim, n) = string(s, delim, n)
append_digit(s::Symbol, args...) = Symbol(append_digit(string(s), args...))

"""
    MLJBase.individuate(collection; delim="", use_one=false)

Given a `collection` of strings or symbols, add numeric suffixes to some elements, to
distinguish repeated elements, and return the resulting unique elements, in the original
order. The return value has the same length as `collection`.

Elements occurring only once are never altered. For a repeated element, the first
occurrence is left unaltered, unless `use_one=true`, in which case it gets the suffix
`1`. Suffixes are separated from the original element by `delim`. A suffix is skipped if
the name it generates already appears in `collection`, or has already been generated, so
that the returned elements are always distinct.

```julia-repl
julia> collection = ["cat", "dog", "cat", "mouse", "cat", "mouse"]
julia> MLJBase.individuate(collection)
6-element Vector{String}:
 "cat"
 "dog"
 "cat2"
 "mouse"
 "cat3"
 "mouse2"

julia> MLJBase.individuate(collection, delim="_", use_one=true)
6-element Vector{String}:
 "cat_1"
 "dog"
 "cat_2"
 "mouse_1"
 "cat_3"
 "mouse_2"

julia> MLJBase.individuate([:x, :x2, :x])
3-element Vector{Symbol}:
 :x
 :x2
 :x3
```

"""
function individuate(iterator; delim="", use_one=false)
    # number of occurrences of each distinct element:
    occurrences = Dict{Any,Int}()
    for s in iterator
        occurrences[s] = get(occurrences, s, 0) + 1
    end

    # Names a generated candidate may not take: every element of the original
    # collection, plus (as we go) every name generated so far. Without this,
    # `[:x, :x2, :x]` would map to `[:x, :x2, :x2]`.
    taken = Set{Any}(keys(occurrences))

    # the next suffix to try, for each repeated element:
    next_digit = Dict{Any,Int}()

    map(iterator) do s
        occurrences[s] == 1 && return s
        digit = get(next_digit, s, 1)
        if digit == 1 && !use_one
            # first occurrence of a repeated element stays as is:
            next_digit[s] = 2
            return s
        end
        candidate = append_digit(s, delim, digit)
        while candidate in taken
            digit += 1
            candidate = append_digit(s, delim, digit)
        end
        push!(taken, candidate)
        next_digit[s] = digit + 1
        return candidate
    end
end
5 changes: 0 additions & 5 deletions test/composition/models/pipelines.jl
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,6 @@ import MLJBase: Pred, Trans

rng = StableRNG(698790187)

# Repeated symbols are numbered from 2, in order of appearance; the first occurrence of
# each symbol is left untouched.
@testset "helpers" begin
@test MLJBase.individuate([:x, :y, :x, :z, :y, :x]) ==
[:x, :y, :x2, :z, :y2, :x3]
end


# # DUMMY MODELS

Expand Down
22 changes: 19 additions & 3 deletions test/resampling.jl
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ using StatisticalMeasures
import LearnAPI
import CategoricalDistributions
import MLJModelInterface
import Measurements

@everywhere begin
using .Models
Expand All @@ -20,9 +21,7 @@ using Test
using MLJBase
import Distributions
import StatsBase
@static if VERSION >= v"1.3.0-DEV.573"
using .Threads
end
using .Threads

struct DummyInterval <: Interval end
dummy_interval=DummyInterval()
Expand Down Expand Up @@ -1262,4 +1261,21 @@ MLJModelInterface.target_scitype(::Type{<:UnivariateFiniteFitter}) =
@test e.measurement[1] ≈ by_hand
end

# Check the named tuple returned by `describe` for a tagged evaluation with four
# measures, two of which share the name `FScore`.
@testset "describe" begin
X, y = make_moons(12)
y = coerce(y, OrderedFactor)
model = ConstantClassifier()
performance_evaluation = evaluate(
"const"=>model, X, y;
measures=[FScore(0.6), FScore(0.7), brier_loss, confmat],
)
summary = describe(performance_evaluation)
# the second `FScore` key is expected to carry the suffix `2`:
@test keys(summary) == (:tag, :FScore, :FScore2, :BrierLoss, :ConfusionMatrix)
@test summary.tag == "const"
# the Brier loss entry carries both the measurement and its 95% uncertainty radius:
bl = summary.BrierLoss
@test Measurements.value(bl) == performance_evaluation.measurement[3]
@test Measurements.uncertainty(bl) == performance_evaluation.uncertainty_radius_95[3]
# the confusion matrix entry is expected to be the raw, undecorated measurement:
@test summary.ConfusionMatrix == performance_evaluation.measurement[4]
end

true
14 changes: 14 additions & 0 deletions test/utilities.jl
Original file line number Diff line number Diff line change
Expand Up @@ -221,5 +221,19 @@ MLJBase.target_scitype(::Type{<:DRegressor2}) =
@test !contains(str, "Int64")
end

# Check `MLJBase.individuate` on symbols and strings, with and without the `delim` and
# `use_one` keyword arguments.
@testset "individuate" begin
# defaults: no delimiter, and first occurrences are left unsuffixed:
@test MLJBase.individuate([:x, :y, :x, :z, :y, :x]) ==
[:x, :y, :x2, :z, :y2, :x3]
# custom delimiter between element and suffix:
@test MLJBase.individuate([:x, :y, :x, :z, :y, :x], delim="_") ==
[:x, :y, :x_2, :z, :y_2, :x_3]
# `use_one=true` also suffixes the first occurrence of a repeated element; the
# unrepeated "dog" is unchanged:
@test MLJBase.individuate(
["cat", "dog", "cat", "mouse", "cat", "mouse"],
use_one=true,
delim="_",
) == ["cat_1", "dog", "cat_2", "mouse_1", "cat_3", "mouse_2"]
# string input with defaults:
@test MLJBase.individuate(["cat", "dog", "cat", "mouse", "cat", "mouse"]) ==
["cat", "dog", "cat2", "mouse", "cat3", "mouse2"]
end

end # module
true
Loading