diff --git a/include/flucoma/algorithms/public/UMAP.hpp b/include/flucoma/algorithms/public/UMAP.hpp index 3d262a4f..7d61d667 100644 --- a/include/flucoma/algorithms/public/UMAP.hpp +++ b/include/flucoma/algorithms/public/UMAP.hpp @@ -18,13 +18,13 @@ namespace fluid { namespace algorithm { namespace impl { -template +template void optimizeLayout(Eigen::ArrayXXd& embedding, RefeferenceArray& reference, Eigen::ArrayXi const& embIndices, Eigen::ArrayXi const& refIndices, Eigen::ArrayXd const& epochsPerSample, Eigen::VectorXd const& AB, bool updateReference, - double learningRate, index maxIter, double gamma = 1.0) + double learningRate, index maxIter, RNG& rng, double gamma = 1.0) { using namespace std; using namespace Eigen; @@ -33,8 +33,6 @@ void optimizeLayout(Eigen::ArrayXXd& embedding, RefeferenceArray& reference, auto distance = DistanceFuncs::map()[DistanceFuncs::Distance::kSqEuclidean]; double a = AB(0); double b = AB(1); - random_device rd; - mt19937 mt(rd()); uniform_int_distribution randomInt(0, reference.rows() - 1); ArrayXd epochsPerNegativeSample = epochsPerSample / negativeSampleRate; ArrayXd nextEpoch = epochsPerSample; @@ -65,7 +63,7 @@ void optimizeLayout(Eigen::ArrayXXd& embedding, RefeferenceArray& reference, epochsPerNegativeSample(j)); for (index k = 0; k < numNegative; k++) { - index negativeIndex = randomInt(mt); + index negativeIndex = randomInt(rng); if (negativeIndex == embIndices(j)) continue; ArrayXd negative = reference.row(negativeIndex); dist = distance(current, negative); @@ -129,7 +127,7 @@ class UMAP using ArrayXi = Eigen::ArrayXi; using VectorXd = Eigen::VectorXd; using SparseMatrixXd = Eigen::SparseMatrix; - using DataSet = FluidDataSet; + using DataSet = FluidDataSet; //fix me, this shouldn't care about id type template using Ref = Eigen::Ref; @@ -172,11 +170,14 @@ class UMAP bool initialized() const { return mInitialized; } DataSet train(DataSet& in, index k = 15, index dims = 2, double minDist = 0.1, - index maxIter = 200, double learningRate = 1.0) + index maxIter = 200, double learningRate = 1.0, index seed = -1) { using namespace Eigen; using namespace _impl; using namespace std; + + std::random_device rd; + std::mt19937_64 rng(seed < 0 ? rd() : seed); SpectralEmbedding spectralEmbedding; index n = in.size(); FluidTensor ids{in.getIds()}; @@ -193,7 +194,7 @@ class UMAP knnGraph = (knnGraph + knnGraphT) - knnGraph.cwiseProduct(knnGraphT); mAB = findAB(minDist); mEmbedding = spectralEmbedding.train(knnGraph, dims); - mEmbedding = normalizeEmbedding(mEmbedding); + mEmbedding = normalizeEmbedding(mEmbedding, rng); knnGraph.makeCompressed(); ArrayXi rowIndices(knnGraph.nonZeros()); ArrayXi colIndices(knnGraph.nonZeros()); @@ -202,16 +203,18 @@ class UMAP computeEpochsPerSample(knnGraph, epochsPerSample); epochsPerSample = (epochsPerSample == 0).select(-1, epochsPerSample); optimizeLayoutAndUpdate(mEmbedding, mEmbedding, rowIndices, colIndices, - epochsPerSample, learningRate, maxIter); + epochsPerSample, learningRate, maxIter, rng); DataSet out(ids, _impl::asFluid(mEmbedding)); mInitialized = true; return out; } - DataSet transform(DataSet& in, index maxIter = 200, double learningRate = 1.0) const + DataSet transform(DataSet& in, index maxIter = 200, double learningRate = 1.0, index seed = -1) const { if (!mInitialized) return DataSet(); - SparseMatrixXd knnGraph(in.size(), mEmbedding.rows()); + std::random_device rd; + std::mt19937_64 rng(seed < 0 ? rd() : seed); + SparseMatrixXd knnGraph(in.size(), mEmbedding.rows()); ArrayXXd dists = ArrayXXd::Zero(in.size(), mK); makeGraph(in, mK, knnGraph, dists, false); knnGraph.makeCompressed(); @@ -227,7 +230,7 @@ class UMAP computeEpochsPerSample(knnGraph, epochsPerSample); epochsPerSample = (epochsPerSample == 0).select(-1, epochsPerSample); optimizeLayout(embedding, mEmbedding, rowIndices, colIndices, - epochsPerSample, learningRate, maxIter); + epochsPerSample, learningRate, maxIter, rng); DataSet out(in.getIds(), _impl::asFluid(embedding)); return out; } @@ -370,19 +373,23 @@ class UMAP for (size_t j = 0; j < asUnsigned(k); j++) { size_t pos = discardFirst ? j + 1 : j; - index neighborIndex = stoi(*nearestIds[pos]); + index neighborIndex = stoi(*nearestIds[pos]);//fixme: dep on string dists(i, asSigned(j)) = distances[pos]; graph.insert(i, neighborIndex) = distances[pos]; } } } - ArrayXXd normalizeEmbedding(const Ref& embedding) + template + ArrayXXd normalizeEmbedding(const Ref& embedding, RNG& rng) { // based on umap python implementation double expansion = 10.0 / embedding.abs().maxCoeff(); - ArrayXXd noise = - 1e-4 * ArrayXXd::Random(embedding.rows(), embedding.cols()); // uniform + std::normal_distribution dist(0.0, 1e-4); + + auto noise = + ArrayXXd::NullaryExpr(embedding.rows(), embedding.cols(), + [&dist, &rng]() { return dist(rng); }); ArrayXXd result = (embedding * expansion) + noise; ArrayXd min = result.colwise().minCoeff(); ArrayXd max = result.colwise().maxCoeff(); @@ -415,25 +422,27 @@ class UMAP }); } + template void optimizeLayoutAndUpdate(ArrayXXd& embedding, ArrayXXd& reference, ArrayXi const& embIndices, ArrayXi const& refIndices, ArrayXd const& epochsPerSample, - double learningRate, index maxIter, + double learningRate, index maxIter,RNG& rng, double gamma = 1.0) { impl::optimizeLayout(embedding, reference, embIndices, refIndices, - epochsPerSample, mAB, true, learningRate, maxIter, + epochsPerSample, mAB, true, learningRate, maxIter, rng, gamma); } + template void optimizeLayout(ArrayXXd& embedding, ArrayXXd const& reference, ArrayXi const& embIndices, ArrayXi const& refIndices, ArrayXd const& epochsPerSample, double learningRate, - index maxIter, double gamma = 1.0) const + index maxIter, RNG& rng, double gamma = 1.0) const { impl::optimizeLayout(embedding, reference, embIndices, refIndices, - epochsPerSample, mAB, false, learningRate, maxIter, + epochsPerSample, mAB, false, learningRate, maxIter, rng, gamma); } diff --git a/include/flucoma/clients/nrt/UMAPClient.hpp b/include/flucoma/clients/nrt/UMAPClient.hpp index 593104a9..e97bb600 100644 --- a/include/flucoma/clients/nrt/UMAPClient.hpp +++ b/include/flucoma/clients/nrt/UMAPClient.hpp @@ -24,7 +24,8 @@ constexpr auto UMAPParams = defineParameters( LongParam("numNeighbours", "Number of Nearest Neighbours", 15, Min(1)), FloatParam("minDist", "Minimum Distance", 0.1, Min(0)), LongParam("iterations", "Number of Iterations", 200, Min(1)), - FloatParam("learnRate", "Learning Rate", 0.1, Min(0.0), Max(1.0))); + FloatParam("learnRate", "Learning Rate", 0.1, Min(0.0), Max(1.0)), + LongParam("seed", "Random Seed", -1)); class UMAPClient : public FluidBaseClient, OfflineIn, @@ -38,7 +39,8 @@ class UMAPClient : public FluidBaseClient, kNumNeighbors, kMinDistance, kNumIter, - kLearningRate + kLearningRate, + kRandomSeed }; public: @@ -84,11 +86,11 @@ class UMAPClient : public FluidBaseClient, FluidDataSet result; try { - result = mAlgorithm.train(src, get(), get(), - get(), get(), - get()); - } - catch (const std::runtime_error& e) //spectra library will throw if eigen decomp fails + result = mAlgorithm.train( + src, get(), get(), get(), + get(), get(), get()); + } catch (const std::runtime_error& + e) // spectra library will throw if eigen decomp fails { return {Result::Status::kError, e.what()}; } @@ -108,7 +110,7 @@ class UMAPClient : public FluidBaseClient, FluidDataSet result; result = mAlgorithm.train(src, get(), get(), get(), get(), - get()); + get(), get()); return OK(); } @@ -127,7 +129,8 @@ class UMAPClient : public FluidBaseClient, if (src.pointSize() != mAlgorithm.inputDims()) return Error(WrongPointSize); StringVector ids{src.getIds()}; FluidDataSet result; - result = mAlgorithm.transform(src, get(), get()); + result = mAlgorithm.transform(src, get(), get(), + get()); destPtr->setDataSet(result); return OK(); } diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 74ec9efa..d233826a 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -116,6 +116,7 @@ add_test_executable(TestTransientSlice algorithms/public/TestTransientSlice.cpp) add_test_executable(TestMLP algorithms/public/TestMLP.cpp) add_test_executable(TestKMeans algorithms/public/TestKMeans.cpp) +add_test_executable(TestUMAP algorithms/public/TestUMAP.cpp) target_link_libraries(TestNoveltySeg PRIVATE TestSignals) target_link_libraries(TestOnsetSeg PRIVATE TestSignals) @@ -153,5 +154,6 @@ catch_discover_tests(TestBufferedProcess WORKING_DIRECTORY "${CMAKE_BINARY_DIR}" catch_discover_tests(TestMLP WORKING_DIRECTORY "${CMAKE_BINARY_DIR}") catch_discover_tests(TestKMeans WORKING_DIRECTORY ${CMAKE_BINARY_DIR}) catch_discover_tests(TestEigenRandom WORKING_DIRECTORY ${CMAKE_BINARY_DIR}) +catch_discover_tests(TestUMAP WORKING_DIRECTORY "${CMAKE_BINARY_DIR}") add_compile_tests("FluidTensor Compilation Tests" data/compile_tests/TestFluidTensor_Compile.cpp) diff --git a/tests/algorithms/public/TestUMAP.cpp b/tests/algorithms/public/TestUMAP.cpp new file mode 100644 index 00000000..81979125 --- /dev/null +++ b/tests/algorithms/public/TestUMAP.cpp @@ -0,0 +1,53 @@ +#define CATCH_CONFIG_MAIN + +#include +#include +#include +#include +#include + +namespace fluid { +TEST_CASE("UMAP is repeatable with manually set seed") +{ + + using DataSet = + FluidDataSet; // boohoo for std::string nonsense + using Tensor = FluidTensor; + using IdList = FluidTensor; + + + algorithm::UMAP algo; + + IdList ids(10); // don't need to care what's in here + Tensor d{{0, 0}, {1, 1}, {2, 2}, {3, 3}, {4, 4}, + {5, 5}, {6, 6}, {7, 7}, {8, 8}, {9, 9}}; + + // needs to be < num records, or segfault (maybe this should assert?) + index k = 3; + index dim = 2; + index iters = 1; + double minDist = 0.1; + double learnRate = 1.0; + + DataSet input(ids, d); + + std::vector embeddings(3, Tensor(d.rows(), dim)); + + algo.train(input, k, dim, minDist, iters, learnRate, 42); + algo.getEmbedding(embeddings[0]); + + algo.clear(); + + algo.train(input, k, dim, minDist, iters, learnRate, 42); + algo.getEmbedding(embeddings[1]); + + algo.clear(); + algo.train(input, k, dim, minDist, iters, learnRate, 32498); + + algo.getEmbedding(embeddings[2]); + + using Catch::Matchers::RangeEquals; + REQUIRE_THAT(embeddings[1], RangeEquals(embeddings[0])); + REQUIRE_THAT(embeddings[1], !RangeEquals(embeddings[2])); +} +} // namespace fluid \ No newline at end of file