Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
49 changes: 29 additions & 20 deletions include/flucoma/algorithms/public/UMAP.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,13 +18,13 @@ namespace fluid {
namespace algorithm {

namespace impl {
template <typename RefeferenceArray>
template <typename RefeferenceArray, typename RNG>
void optimizeLayout(Eigen::ArrayXXd& embedding, RefeferenceArray& reference,
Eigen::ArrayXi const& embIndices,
Eigen::ArrayXi const& refIndices,
Eigen::ArrayXd const& epochsPerSample,
Eigen::VectorXd const& AB, bool updateReference,
double learningRate, index maxIter, double gamma = 1.0)
double learningRate, index maxIter, RNG& rng, double gamma = 1.0)
{
using namespace std;
using namespace Eigen;
Expand All @@ -33,8 +33,6 @@ void optimizeLayout(Eigen::ArrayXXd& embedding, RefeferenceArray& reference,
auto distance = DistanceFuncs::map()[DistanceFuncs::Distance::kSqEuclidean];
double a = AB(0);
double b = AB(1);
random_device rd;
mt19937 mt(rd());
uniform_int_distribution<index> randomInt(0, reference.rows() - 1);
ArrayXd epochsPerNegativeSample = epochsPerSample / negativeSampleRate;
ArrayXd nextEpoch = epochsPerSample;
Expand Down Expand Up @@ -65,7 +63,7 @@ void optimizeLayout(Eigen::ArrayXXd& embedding, RefeferenceArray& reference,
epochsPerNegativeSample(j));
for (index k = 0; k < numNegative; k++)
{
index negativeIndex = randomInt(mt);
index negativeIndex = randomInt(rng);
if (negativeIndex == embIndices(j)) continue;
ArrayXd negative = reference.row(negativeIndex);
dist = distance(current, negative);
Expand Down Expand Up @@ -129,7 +127,7 @@ class UMAP
using ArrayXi = Eigen::ArrayXi;
using VectorXd = Eigen::VectorXd;
using SparseMatrixXd = Eigen::SparseMatrix<double>;
using DataSet = FluidDataSet<std::string, double, 1>;
using DataSet = FluidDataSet<std::string, double, 1>; //fix me, this shouldn't care about id type
template <typename T>
using Ref = Eigen::Ref<T>;

Expand Down Expand Up @@ -172,11 +170,14 @@ class UMAP
bool initialized() const { return mInitialized; }

DataSet train(DataSet& in, index k = 15, index dims = 2, double minDist = 0.1,
index maxIter = 200, double learningRate = 1.0)
index maxIter = 200, double learningRate = 1.0, index seed = -1)
{
using namespace Eigen;
using namespace _impl;
using namespace std;

std::random_device rd;
std::mt19937_64 rng(seed < 0 ? rd() : seed);
SpectralEmbedding spectralEmbedding;
index n = in.size();
FluidTensor<string, 1> ids{in.getIds()};
Expand All @@ -193,7 +194,7 @@ class UMAP
knnGraph = (knnGraph + knnGraphT) - knnGraph.cwiseProduct(knnGraphT);
mAB = findAB(minDist);
mEmbedding = spectralEmbedding.train(knnGraph, dims);
mEmbedding = normalizeEmbedding(mEmbedding);
mEmbedding = normalizeEmbedding(mEmbedding, rng);
knnGraph.makeCompressed();
ArrayXi rowIndices(knnGraph.nonZeros());
ArrayXi colIndices(knnGraph.nonZeros());
Expand All @@ -202,16 +203,18 @@ class UMAP
computeEpochsPerSample(knnGraph, epochsPerSample);
epochsPerSample = (epochsPerSample == 0).select(-1, epochsPerSample);
optimizeLayoutAndUpdate(mEmbedding, mEmbedding, rowIndices, colIndices,
epochsPerSample, learningRate, maxIter);
epochsPerSample, learningRate, maxIter, rng);
DataSet out(ids, _impl::asFluid(mEmbedding));
mInitialized = true;
return out;
}

DataSet transform(DataSet& in, index maxIter = 200, double learningRate = 1.0) const
DataSet transform(DataSet& in, index maxIter = 200, double learningRate = 1.0, index seed = -1) const
{
if (!mInitialized) return DataSet();
SparseMatrixXd knnGraph(in.size(), mEmbedding.rows());
std::random_device rd;
std::mt19937_64 rng(seed < 0 ? rd() : seed);
SparseMatrixXd knnGraph(in.size(), mEmbedding.rows());
ArrayXXd dists = ArrayXXd::Zero(in.size(), mK);
makeGraph(in, mK, knnGraph, dists, false);
knnGraph.makeCompressed();
Expand All @@ -227,7 +230,7 @@ class UMAP
computeEpochsPerSample(knnGraph, epochsPerSample);
epochsPerSample = (epochsPerSample == 0).select(-1, epochsPerSample);
optimizeLayout(embedding, mEmbedding, rowIndices, colIndices,
epochsPerSample, learningRate, maxIter);
epochsPerSample, learningRate, maxIter, rng);
DataSet out(in.getIds(), _impl::asFluid(embedding));
return out;
}
Expand Down Expand Up @@ -370,19 +373,23 @@ class UMAP
for (size_t j = 0; j < asUnsigned(k); j++)
{
size_t pos = discardFirst ? j + 1 : j;
index neighborIndex = stoi(*nearestIds[pos]);
index neighborIndex = stoi(*nearestIds[pos]);//fixme: dep on string
dists(i, asSigned(j)) = distances[pos];
graph.insert(i, neighborIndex) = distances[pos];
}
}
}

ArrayXXd normalizeEmbedding(const Ref<ArrayXXd>& embedding)
template <typename RNG>
ArrayXXd normalizeEmbedding(const Ref<ArrayXXd>& embedding, RNG& rng)
{
// based on umap python implementation
double expansion = 10.0 / embedding.abs().maxCoeff();
ArrayXXd noise =
1e-4 * ArrayXXd::Random(embedding.rows(), embedding.cols()); // uniform
std::normal_distribution dist(0.0, 1e-4);

auto noise =
ArrayXXd::NullaryExpr(embedding.rows(), embedding.cols(),
[&dist, &rng]() { return dist(rng); });
ArrayXXd result = (embedding * expansion) + noise;
ArrayXd min = result.colwise().minCoeff();
ArrayXd max = result.colwise().maxCoeff();
Expand Down Expand Up @@ -415,25 +422,27 @@ class UMAP
});
}

template<typename RNG>
void optimizeLayoutAndUpdate(ArrayXXd& embedding, ArrayXXd& reference,
ArrayXi const& embIndices,
ArrayXi const& refIndices,
ArrayXd const& epochsPerSample,
double learningRate, index maxIter,
double learningRate, index maxIter,RNG& rng,
double gamma = 1.0)
{
impl::optimizeLayout(embedding, reference, embIndices, refIndices,
epochsPerSample, mAB, true, learningRate, maxIter,
epochsPerSample, mAB, true, learningRate, maxIter, rng,
gamma);
}

template<typename RNG>
void optimizeLayout(ArrayXXd& embedding, ArrayXXd const& reference,
ArrayXi const& embIndices, ArrayXi const& refIndices,
ArrayXd const& epochsPerSample, double learningRate,
index maxIter, double gamma = 1.0) const
index maxIter, RNG& rng, double gamma = 1.0) const
{
impl::optimizeLayout(embedding, reference, embIndices, refIndices,
epochsPerSample, mAB, false, learningRate, maxIter,
epochsPerSample, mAB, false, learningRate, maxIter, rng,
gamma);
}

Expand Down
21 changes: 12 additions & 9 deletions include/flucoma/clients/nrt/UMAPClient.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,8 @@ constexpr auto UMAPParams = defineParameters(
LongParam("numNeighbours", "Number of Nearest Neighbours", 15, Min(1)),
FloatParam("minDist", "Minimum Distance", 0.1, Min(0)),
LongParam("iterations", "Number of Iterations", 200, Min(1)),
FloatParam("learnRate", "Learning Rate", 0.1, Min(0.0), Max(1.0)));
FloatParam("learnRate", "Learning Rate", 0.1, Min(0.0), Max(1.0)),
LongParam("seed", "Random Seed", -1));

class UMAPClient : public FluidBaseClient,
OfflineIn,
Expand All @@ -38,7 +39,8 @@ class UMAPClient : public FluidBaseClient,
kNumNeighbors,
kMinDistance,
kNumIter,
kLearningRate
kLearningRate,
kRandomSeed
};

public:
Expand Down Expand Up @@ -84,11 +86,11 @@ class UMAPClient : public FluidBaseClient,
FluidDataSet<string, double, 1> result;
try
{
result = mAlgorithm.train(src, get<kNumNeighbors>(), get<kNumDimensions>(),
get<kMinDistance>(), get<kNumIter>(),
get<kLearningRate>());
}
catch (const std::runtime_error& e) //spectra library will throw if eigen decomp fails
result = mAlgorithm.train(
src, get<kNumNeighbors>(), get<kNumDimensions>(), get<kMinDistance>(),
get<kNumIter>(), get<kLearningRate>(), get<kRandomSeed>());
} catch (const std::runtime_error&
e) // spectra library will throw if eigen decomp fails
{
return {Result::Status::kError, e.what()};
}
Expand All @@ -108,7 +110,7 @@ class UMAPClient : public FluidBaseClient,
FluidDataSet<string, double, 1> result;
result = mAlgorithm.train(src, get<kNumNeighbors>(), get<kNumDimensions>(),
get<kMinDistance>(), get<kNumIter>(),
get<kLearningRate>());
get<kLearningRate>(), get<kRandomSeed>());
return OK();
}

Expand All @@ -127,7 +129,8 @@ class UMAPClient : public FluidBaseClient,
if (src.pointSize() != mAlgorithm.inputDims()) return Error(WrongPointSize);
StringVector ids{src.getIds()};
FluidDataSet<string, double, 1> result;
result = mAlgorithm.transform(src, get<kNumIter>(), get<kLearningRate>());
result = mAlgorithm.transform(src, get<kNumIter>(), get<kLearningRate>(),
get<kRandomSeed>());
destPtr->setDataSet(result);
return OK();
}
Expand Down
2 changes: 2 additions & 0 deletions tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,7 @@ add_test_executable(TestTransientSlice algorithms/public/TestTransientSlice.cpp)

add_test_executable(TestMLP algorithms/public/TestMLP.cpp)
add_test_executable(TestKMeans algorithms/public/TestKMeans.cpp)
add_test_executable(TestUMAP algorithms/public/TestUMAP.cpp)

target_link_libraries(TestNoveltySeg PRIVATE TestSignals)
target_link_libraries(TestOnsetSeg PRIVATE TestSignals)
Expand Down Expand Up @@ -153,5 +154,6 @@ catch_discover_tests(TestBufferedProcess WORKING_DIRECTORY "${CMAKE_BINARY_DIR}"
catch_discover_tests(TestMLP WORKING_DIRECTORY "${CMAKE_BINARY_DIR}")
catch_discover_tests(TestKMeans WORKING_DIRECTORY ${CMAKE_BINARY_DIR})
catch_discover_tests(TestEigenRandom WORKING_DIRECTORY ${CMAKE_BINARY_DIR})
catch_discover_tests(TestUMAP WORKING_DIRECTORY "${CMAKE_BINARY_DIR}")

add_compile_tests("FluidTensor Compilation Tests" data/compile_tests/TestFluidTensor_Compile.cpp)
53 changes: 53 additions & 0 deletions tests/algorithms/public/TestUMAP.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
#define CATCH_CONFIG_MAIN

#include <catch2/catch_all.hpp>
#include <flucoma/algorithms/public/UMAP.hpp>
#include <flucoma/data/FluidDataSet.hpp>
#include <flucoma/data/FluidTensor.hpp>
#include <vector>

namespace fluid {
TEST_CASE("UMAP is repeatable with manually set seed")
{

using DataSet =
FluidDataSet<std::string, double, 1>; // boohoo for std::string nonsense
using Tensor = FluidTensor<double, 2>;
using IdList = FluidTensor<std::string, 1>;


algorithm::UMAP algo;

IdList ids(10); // don't need to care what's in here
Tensor d{{0, 0}, {1, 1}, {2, 2}, {3, 3}, {4, 4},
{5, 5}, {6, 6}, {7, 7}, {8, 8}, {9, 9}};

// needs to be < num records, or segfault (maybe this should assert?)
index k = 3;
index dim = 2;
index iters = 1;
double minDist = 0.1;
double learnRate = 1.0;

DataSet input(ids, d);

std::vector embeddings(3, Tensor(d.rows(), dim));

algo.train(input, k, dim, minDist, iters, learnRate, 42);
algo.getEmbedding(embeddings[0]);

algo.clear();

algo.train(input, k, dim, minDist, iters, learnRate, 42);
algo.getEmbedding(embeddings[1]);

algo.clear();
algo.train(input, k, dim, minDist, iters, learnRate, 32498);

algo.getEmbedding(embeddings[2]);

using Catch::Matchers::RangeEquals;
REQUIRE_THAT(embeddings[1], RangeEquals(embeddings[0]));
REQUIRE_THAT(embeddings[1], !RangeEquals(embeddings[2]));
}
} // namespace fluid