diff --git a/.github/workflows/build-release.yml b/.github/workflows/build-release.yml index 1e87782..d91d4a5 100644 --- a/.github/workflows/build-release.yml +++ b/.github/workflows/build-release.yml @@ -4,9 +4,11 @@ on: push: branches: - main + - dev pull_request: branches: - main + - dev workflow_dispatch: jobs: @@ -30,6 +32,9 @@ jobs: - name: Build run: dotnet build --configuration Release --no-restore + + - name: Performance Test + run: dotnet run --project SharpVectorPerformance --configuration Release # - name: Publish # run: dotnet publish --configuration Release --output ./publish --no-build @@ -40,7 +45,18 @@ jobs: # name: release-build # path: ./publish - - name: Upload artifact + - name: Performance Results + run: | + echo "## Performance Results" > $GITHUB_STEP_SUMMARY + cat ./BenchmarkDotNet.Artifacts/results/SharpVectorPerformance.MemoryVectorDatabasePerformance-report-github.md >> $GITHUB_STEP_SUMMARY + + - name: Upload Performance artifact + uses: actions/upload-artifact@v4 + with: + name: performance-results + path: './src/BenchmarkDotNet.Artifacts/*' + + - name: Upload Nuget artifact uses: actions/upload-artifact@v4 with: name: nuget-package diff --git a/.github/workflows/dotnet-tests.yml b/.github/workflows/dotnet-tests.yml index 7f283aa..5c655cd 100644 --- a/.github/workflows/dotnet-tests.yml +++ b/.github/workflows/dotnet-tests.yml @@ -1,9 +1,14 @@ name: .NET Core Tests on: + push: + branches: + - main + - dev pull_request: branches: - main + - dev workflow_dispatch: jobs: @@ -28,5 +33,11 @@ jobs: - name: Build run: dotnet build --no-restore - - name: Run tests - run: dotnet test --no-build --verbosity normal \ No newline at end of file + - name: Run tests with code coverage + run: dotnet test --no-build --verbosity normal --results-directory "./TestResults/Coverage/" --collect:"XPlat Code Coverage" + + - name: Upload test results artifact + uses: actions/upload-artifact@v4 + with: + name: test-results + path: '**/TestResults/**' diff --git 
a/.gitignore b/.gitignore index 326e77b..a787c7d 100644 --- a/.gitignore +++ b/.gitignore @@ -3,3 +3,6 @@ obj bin .DS_Store + +BenchmarkDotNet.Artifacts/ +TestResults/ \ No newline at end of file diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..5cb7c2a --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,66 @@ +# Changelog + +All notable changes to this project will be documented in this file. + +The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), +and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). + +## v2.0.0 + +Added: + +- Add data persistence capability to save/load from a file or to/from a `Stream` (Both SharpVector and SharpVector.OpenAI) +- Add Chinese language/character support + +Breaking Change: + +- Refactor `IVocabularyStore` to be used within `MemoryDictionaryVectorStoreWithVocabulary`. This simplifies implementation of `MemoryVectorDatabaseBase`, and helps to enable data persistence capability. + +Notes: + +- The breaking change only applies if the base classes are being used. If the `BasicMemoryVectorDatabase` is being used, this will likely not break applications that depend on this library. However, in some instances where explicitly depending on `VectorTextResult` and its properties (without using `var` in consuming code) there might be minor code changes needed when migrating from previous versions of the library. + +## v1.0.1 (2025-02-06) + +- Upgrade to .NET 8 or higher + +### v1.0.0 (2024-05-24) + +Added: + +- Simplify object model by combining Async and non-Async classes, `BasicMemoryVectorDatabase` now supports both synchronous and asynchronous operations. +- Refactored to remove unnecessary classes where the `Async` versions will work just fine. 
+- Improve async/await and multi-threading use + +### v0.9.8-beta (2024-05-20) + +Added: + +- Added `Async` version of classes to support multi-threading +- Metadata is no longer required when calling `.AddText()` and `.AddTextAsync()` +- Refactor `IVectorSimilarityCalculator` to `IVectorComparer` and `CosineVectorSimilarityCalculatorAsync` to `CosineSimilarityVectorComparerAsync` +- Add new `EuclideanDistanceVectorComparerAsync` +- Fix `MemoryVectorDatabase` to no longer require unused `TId` generic type +- Rename `VectorSimilarity` and `Similarity` properties to `VectorComparison` + +### v0.9.5-beta (2024-05-18) + +Added: + +- Add `TextDataLoader` class to provide support for different methods of text chunking when loading documents into the vector database. + +### v0.9.0-beta (2024-05-18) + +Added: + +- Introduced the `BasicMemoryVectorDatabase` class as the basic Vector Database implementation that uses a Bag of Words vectorization strategy, with Cosine similarity, a dictionary vocabulary store, and a basic text preprocessor. +- Add more C# Generics use, so the library is more customizable when used, and custom vector databases can be implemented if desired. +- Added `VectorTextResultItem.Similarity` so consuming code can inspect similarity of the Text in the vector search results. +- Update `.Search` method to support search result paging and threshold support for similarity comparison +- Add some basic Unit Tests + +### v0.8.0-beta (2024-05-17) + +Added: + +- Initial release - let's do this! diff --git a/README.md b/README.md index 572da8e..5b3e71c 100644 --- a/README.md +++ b/README.md @@ -138,54 +138,6 @@ Here's a screenshot of the test console app running: ![](assets/build5nines-sharpvector-console-screenshot.jpg) -## Change Log - -## v2.0.0 (In Progress) - -Feature: -- Add data persistence capability - -Breaking Change: -- Refactor `IVocabularyStore` to be used within `MemoryDictionaryVectorStoreWithVocabulary`. 
This simplifies implementation of `MemoryVectorDatabaseBase`, and helps to enable data persistence capability. - -Notes: -- The breaking change only applies if the base classes are being used. If the `BasicMemoryVectorDatabase` is being used, this will likely not break applications that depend on this library. However, in some instances where explicitly depending on `VectorTextResult` it's properties (without using `var` in consuming code) there might be minor code changes needed when migrating from previous versions of the library. - -## v1.0.1 (2025-02-06) - -- Upgrade to .NET 8 or higher - -### v1.0.0 (2024-05-24) - -- Simplify object model by combining Async and non-Async classes, `BasicMemoryVectorDatabase` now support both synchronous and asynchronous operations. -- Refactored to remove unnecessary classes where the `Async` versions will work just fine. -- Improve async/await and multi-threading use - -### v0.9.8-beta (2024-05-20) - -- Added `Async` version of classes to support multi-threading -- Metadata is no longer required when calling `.AddText()` and `.AddTextAsync()` -- Refactor `IVectorSimilarityCalculator` to `IVectorComparer` and `CosineVectorSimilarityCalculatorAsync` to `CosineSimilarityVectorComparerAsync` -- Add new `EuclideanDistanceVectorComparerAsync` -- Fix `MemoryVectorDatabase` to no longer requird unused `TId` generic type -- Rename `VectorSimilarity` and `Similarity` properties to `VectorComparison` - -### v0.9.5-beta (2024-05-18) - -- Add `TextDataLoader` class to provide support for different methods of text chunking when loading documents into the vector database. - -### v0.9.0-beta (2024-05-18) - -- Introduced the `BasicMemoryVectorDatabase` class as the basic Vector Database implementations that uses a Bag of Words vectorization strategy, with Cosine similarity, a dictionary vocabulary store, and a basic text preprocessor. 
-- Add more C# Generics use, so the library is more customizable when used, and custom vector databases can be implemented if desired. -- Added `VectorTextResultItem.Similarity` so consuming code can inspect similarity of the Text in the vector search results. -- Update `.Search` method to support search result paging and threshold support for similarity comparison -- Add some basic Unit Tests - -### v0.8.0-beta (2024-05-17) - -- Initial release - let's do this! - ## Maintained By The **Build5Nines SharpVector** project is maintained by [Chris Pietschmann](https://pietschsoft.com?utm_source=github&utm_medium=sharpvector), founder of [Build5Nines](https://build5nines.com?utm_source=github&utm_medium=sharpvector), Microsoft MVP, HashiCorp Ambassador, and Microsoft Certified Trainer (MCT). diff --git a/samples/genai-rag-onnx/Program.cs b/samples/genai-rag-onnx/Program.cs index 0933bc0..5d3471f 100644 --- a/samples/genai-rag-onnx/Program.cs +++ b/samples/genai-rag-onnx/Program.cs @@ -162,11 +162,12 @@ static async Task Main(string[] args) var generatorParams = new GeneratorParams(model); generatorParams.SetSearchOption("max_length", maxPromptLength); generatorParams.SetSearchOption("past_present_share_buffer", false); - generatorParams.SetInputSequences(tokens); + //generatorParams.SetInputSequences(tokens); // Generate the response Console.WriteLine("AI is thinking..."); var generator = new Generator(model, generatorParams); + generator.AppendTokenSequences(tokens); // show in console that the assistant is responding Console.WriteLine(""); @@ -174,7 +175,7 @@ static async Task Main(string[] args) // Output response as each token in generated while (!generator.IsDone()) { - generator.ComputeLogits(); + //generator.ComputeLogits(); generator.GenerateNextToken(); var output = GetOutputTokens(generator, tokenizer); Console.Write(output); diff --git a/samples/genai-rag-onnx/genai-rag-onnx.csproj b/samples/genai-rag-onnx/genai-rag-onnx.csproj index 20adb58..8bdbc09 100644 
--- a/samples/genai-rag-onnx/genai-rag-onnx.csproj +++ b/samples/genai-rag-onnx/genai-rag-onnx.csproj @@ -8,11 +8,11 @@ enable - - - - - + + + + + diff --git a/src/Build5Nines.SharpVector.OpenAI/Build5Nines.SharpVector.OpenAI.csproj b/src/Build5Nines.SharpVector.OpenAI/Build5Nines.SharpVector.OpenAI.csproj index 588c6a5..9579dcb 100644 --- a/src/Build5Nines.SharpVector.OpenAI/Build5Nines.SharpVector.OpenAI.csproj +++ b/src/Build5Nines.SharpVector.OpenAI/Build5Nines.SharpVector.OpenAI.csproj @@ -24,7 +24,7 @@ - + diff --git a/src/Build5Nines.SharpVector.OpenAI/OpenAIMemoryVectorDatabaseBase.cs b/src/Build5Nines.SharpVector.OpenAI/OpenAIMemoryVectorDatabaseBase.cs index 735c0bb..3c59866 100644 --- a/src/Build5Nines.SharpVector.OpenAI/OpenAIMemoryVectorDatabaseBase.cs +++ b/src/Build5Nines.SharpVector.OpenAI/OpenAIMemoryVectorDatabaseBase.cs @@ -3,13 +3,15 @@ using Build5Nines.SharpVector.VectorStore; using System.Collections.Concurrent; using OpenAI.Embeddings; +using System.IO.Compression; +using System.Text.Json; namespace Build5Nines.SharpVector.OpenAI; public abstract class OpenAIMemoryVectorDatabaseBase : IVectorDatabase where TId : notnull - where TVectorStore : IVectorStore + where TVectorStore : IVectorStore where TIdGenerator : IIdGenerator, new() where TVectorComparer : IVectorComparer, new() { @@ -76,7 +78,7 @@ public IEnumerable GetIds() /// /// /// - public IVectorTextItem GetText(TId id) + public IVectorTextItem GetText(TId id) { return VectorStore.Get(id); } @@ -86,7 +88,7 @@ public IVectorTextItem GetText(TId id) /// /// /// - public IVectorTextItem DeleteText(TId id) + public IVectorTextItem DeleteText(TId id) { return VectorStore.Delete(id); } @@ -193,7 +195,7 @@ public void UpdateTextAndMetadata(TId id, string text, TMetadata metadata) /// The highest number of results to show. /// The similarity threshold. Only return items greater or equal to the threshold. Null returns all. /// - public IVectorTextResult Search(string queryText, float? 
threshold = null, int pageIndex = 0, int? pageCount = null) + public IVectorTextResult Search(string queryText, float? threshold = null, int pageIndex = 0, int? pageCount = null) { return SearchAsync(queryText, threshold, pageIndex, pageCount).Result; } @@ -206,7 +208,7 @@ public IVectorTextResult Search(string queryText, float? threshold = /// The page index of the search results. Default is 0. /// The number of search results per page. Default is Null and returns all results. /// - public async Task> SearchAsync(string queryText, float? threshold = null, int pageIndex = 0, int? pageCount = null) + public async Task> SearchAsync(string queryText, float? threshold = null, int pageIndex = 0, int? pageCount = null) { var similarities = await CalculateVectorComparisonAsync(queryText, threshold); @@ -248,4 +250,137 @@ private async Task>> CalculateVector } return results; } + + /// + /// Serializes the Vector Database to a JSON stream + /// + /// + /// + /// + public virtual async Task SerializeToJsonStreamAsync(Stream stream) + { + var streamVectorStore = new MemoryStream(); + var streamVocabularyStore = new MemoryStream(); + + await VectorStore.SerializeToJsonStreamAsync(streamVectorStore); + + using (var archive = new ZipArchive(stream, ZipArchiveMode.Create, true)) + { + var entryDatabaseType = archive.CreateEntry("database.json"); + using (var entryStream = entryDatabaseType.Open()) + { + var databaseInfo = new DatabaseInfo(this.GetType().FullName); + + var databaseInfoJson = JsonSerializer.Serialize(databaseInfo); + + if (databaseInfoJson != null) + { + var databaseTypeBytes = System.Text.Encoding.UTF8.GetBytes(databaseInfoJson); + await entryStream.WriteAsync(databaseTypeBytes); + await entryStream.FlushAsync(); + } + else + { + throw new InvalidOperationException("Type name cannot be null."); + } + } + var entryVectorStore = archive.CreateEntry("vectorstore.json"); + using (var entryStream = entryVectorStore.Open()) + { + streamVectorStore.Position = 0; + await 
streamVectorStore.CopyToAsync(entryStream); + await entryStream.FlushAsync(); + } + } + + await stream.FlushAsync(); + } + + public virtual void SerializeToJsonStream(Stream stream) + { + if (stream == null) + { + throw new ArgumentNullException(nameof(stream)); + } + SerializeToJsonStreamAsync(stream).Wait(); + } + + public virtual async Task DeserializeFromJsonStreamAsync(Stream stream) + { + if (stream == null) + { + throw new ArgumentNullException(nameof(stream)); + } + + using (var archive = new ZipArchive(stream, ZipArchiveMode.Read)) + { + var entryDatabaseType = archive.GetEntry("database.json"); + if (entryDatabaseType != null) + { + using (var entryStream = entryDatabaseType.Open()) + { + var databaseTypeStream = new MemoryStream(); + await entryStream.CopyToAsync(databaseTypeStream); + databaseTypeStream.Position = 0; + + var databaseTypeBytes = new byte[databaseTypeStream.Length]; + await databaseTypeStream.ReadAsync(databaseTypeBytes); + var databaseInfoJson = System.Text.Encoding.UTF8.GetString(databaseTypeBytes); + + var databaseInfo = JsonSerializer.Deserialize(databaseInfoJson); + + string SupportedVersion = "1.0.0"; + string SupportedSchema = "Build5Nines.SharpVector"; + + if (databaseInfo == null) + { + throw new DatabaseFileInfoException("Database info entry is null."); + } + + if (databaseInfo.Schema != SupportedSchema) + { + throw new DatabaseFileSchemaException($"The database schema does not match the expected schema (Expected: {SupportedSchema} - Actual: {databaseInfo.Schema})."); + } + + if (databaseInfo.Version != SupportedVersion) + { + throw new DatabaseFileVersionException($"The database version does not match the expected version (Expected: {SupportedVersion} - Actual: {databaseInfo.Version})."); + } + + if (databaseInfo.ClassType != this.GetType().FullName) + { + throw new DatabaseFileClassTypeException($"The database class type does not match the expected type (Expected: {this.GetType().FullName} - Actual: 
{databaseInfo.ClassType})"); + } + } + } + else + { + throw new DatabaseFileMissingEntryException("Database info entry not found.", "database"); + } + + + var entryVectorStore = archive.GetEntry("vectorstore.json"); + if (entryVectorStore != null) + { + using (var entryStream = entryVectorStore.Open()) + { + await VectorStore.DeserializeFromJsonStreamAsync(entryStream); + } + } + else + { + throw new DatabaseFileMissingEntryException("Vector Store entry not found.", "vectorstore"); + } + } + } + + public virtual void DeserializeFromJsonStream(Stream stream) + { + if (stream == null) + { + throw new ArgumentNullException(nameof(stream)); + } + DeserializeFromJsonStreamAsync(stream).Wait(); + } + } \ No newline at end of file diff --git a/src/Build5Nines.SharpVector.OpenAI/docs/README.md b/src/Build5Nines.SharpVector.OpenAI/docs/README.md index e097632..760ee5b 100644 --- a/src/Build5Nines.SharpVector.OpenAI/docs/README.md +++ b/src/Build5Nines.SharpVector.OpenAI/docs/README.md @@ -1,3 +1,7 @@ Build5Nines.SharpVector.OpenAI is a lightweight in-memory Vector Database for use in any .NET application that connects to an embeddings model running in Azure OpenAI for generating the text embeddings. The `Build5Nines.SharpVector.OpenAI.BasicOpenAIMemoryVectorDatabase` class uses an OpenAI Embeddings Client with Cosine similarity search. 
+ +## Tutorials + +- [Enhanced In-Memory Text Vector Search in .NET with SharpVector and OpenAI Embeddings](https://build5nines.com/enhanced-in-memory-text-vector-search-in-net-with-sharpvector-and-openai-embeddings/?utm_source=github&utm_medium=sharpvector) by Chris Pietschmann \ No newline at end of file diff --git a/src/Build5Nines.SharpVector/Build5Nines.SharpVector.csproj b/src/Build5Nines.SharpVector/Build5Nines.SharpVector.csproj index 196bfc4..357d821 100644 --- a/src/Build5Nines.SharpVector/Build5Nines.SharpVector.csproj +++ b/src/Build5Nines.SharpVector/Build5Nines.SharpVector.csproj @@ -16,7 +16,7 @@ https://github.com/Build5Nines/SharpVector/blob/main/LICENSE Chris Pietschmann Build5Nines LLC - vector;search;database;data;rag + vector;search;database;data;rag;search;llm;generative ai;ai;genai diff --git a/src/Build5Nines.SharpVector/DatabaseFile.cs b/src/Build5Nines.SharpVector/DatabaseFile.cs new file mode 100644 index 0000000..142d7e9 --- /dev/null +++ b/src/Build5Nines.SharpVector/DatabaseFile.cs @@ -0,0 +1,152 @@ + +using System.IO.Compression; +using System.Text.Json; + +namespace Build5Nines.SharpVector; + +public static class DatabaseFile +{ + /// + /// Load the vector database from a stream + /// + /// + /// + /// + public static async Task> Load(Stream stream) + { + return await Load, TMetadata>(stream); + } + + /// + /// Load the vector database from a stream + /// + /// + /// + /// + /// + public static async Task Load(Stream stream) + where TVectorDatabase : MemoryVectorDatabase, new() + { + return await Load(stream); + } + + /// + /// Load the vector database from a stream + /// + /// + /// + public static async Task Load(Stream stream) + where TVectorDatabase : IVectorDatabase, new() + where TId : notnull + { + var vdb = new TVectorDatabase(); + return await Load(vdb, stream); + } + + /// + /// Load the vector database from a stream + /// + /// + /// + /// + public static async Task Load(TVectorDatabase vdb, Stream stream) + where 
TVectorDatabase : IVectorDatabase + where TId : notnull + { + await vdb.DeserializeFromJsonStreamAsync(stream); + return vdb; + } + + /// + /// Load the vector database from a stream + /// + /// + /// + /// + public static async Task> Load(string filePath) + { + return await Load, TMetadata>(filePath); + } + + /// + /// Load the vector database from a stream + /// + /// + /// + /// + /// + public static async Task Load(string filePath) + where TVectorDatabase : MemoryVectorDatabase, new() + { + return await Load(filePath); + } + + /// + /// Load the vector database from a file + /// + /// + /// + public static async Task Load(string filePath) + where TVectorDatabase: IVectorDatabase, new() + where TId : notnull + { + var vdb = new TVectorDatabase(); + return await Load(vdb, filePath); + } + + /// + /// Load the vector database from a file + /// + /// + /// + /// + public static async Task Load(TVectorDatabase vdb, string filePath) + where TVectorDatabase : IVectorDatabase + where TId : notnull + { + await vdb.LoadFromFileAsync(filePath); + return vdb; + } + + public static async Task LoadDatabaseInfoAsync(Stream stream) + { + using (var archive = new ZipArchive(stream, ZipArchiveMode.Read)) + { + var entryDatabaseType = archive.GetEntry("database.json"); + if (entryDatabaseType != null) + { + using (var entryStream = entryDatabaseType.Open()) + { + var databaseTypeStream = new MemoryStream(); + await entryStream.CopyToAsync(databaseTypeStream); + databaseTypeStream.Position = 0; + + var databaseTypeBytes = new byte[databaseTypeStream.Length]; + await databaseTypeStream.ReadAsync(databaseTypeBytes); + var databaseInfoJson = System.Text.Encoding.UTF8.GetString(databaseTypeBytes); + + var databaseInfo = JsonSerializer.Deserialize(databaseInfoJson); + + if (databaseInfo == null) + { + throw new DatabaseFileInfoException("Database info entry is null."); + } + + return databaseInfo; + } + } + else + { + throw new DatabaseFileMissingEntryException("Database info entry not 
found.", "database"); + } + } + } + + public static async Task LoadDatabaseInfoAsync(string filePath) + { + using (var stream = File.OpenRead(filePath)) + { + return await LoadDatabaseInfoAsync(stream); + } + } +} \ No newline at end of file diff --git a/src/Build5Nines.SharpVector/DatabaseFileException.cs b/src/Build5Nines.SharpVector/DatabaseFileException.cs new file mode 100644 index 0000000..b5468af --- /dev/null +++ b/src/Build5Nines.SharpVector/DatabaseFileException.cs @@ -0,0 +1,97 @@ +namespace Build5Nines.SharpVector; + +public class DatabaseFileException : Exception +{ + public DatabaseFileException() + { + } + + public DatabaseFileException(string message) + : base(message) + { + } + + public DatabaseFileException(string message, Exception innerException) + : base(message, innerException) + { + } +} + +public class DatabaseFileInfoException : DatabaseFileException +{ + public DatabaseFileInfoException() + { + } + + public DatabaseFileInfoException(string message) + : base(message) + { + } + + public DatabaseFileInfoException(string message, Exception innerException) + : base(message, innerException) + { + } +} + +public class DatabaseFileSchemaException : DatabaseFileException +{ + public DatabaseFileSchemaException() + { + } + + public DatabaseFileSchemaException(string message) + : base(message) + { + } + + public DatabaseFileSchemaException(string message, Exception innerException) + : base(message, innerException) + { + } +} + +public class DatabaseFileVersionException : DatabaseFileException +{ + public DatabaseFileVersionException() + { + } + + public DatabaseFileVersionException(string message) + : base(message) + { + } + + public DatabaseFileVersionException(string message, Exception innerException) + : base(message, innerException) + { + } +} + +public class DatabaseFileClassTypeException : DatabaseFileException +{ + public DatabaseFileClassTypeException() + { + } + + public DatabaseFileClassTypeException(string message) + : base(message) + { + 
} + + public DatabaseFileClassTypeException(string message, Exception innerException) + : base(message, innerException) + { + } +} + +public class DatabaseFileMissingEntryException : DatabaseFileException +{ + public DatabaseFileMissingEntryException(string message, string missingEntry) + : base(message) + { + MissingEntry = missingEntry; + } + + public string MissingEntry { get; private set; } +} \ No newline at end of file diff --git a/src/Build5Nines.SharpVector/DatabaseInfo.cs b/src/Build5Nines.SharpVector/DatabaseInfo.cs new file mode 100644 index 0000000..ccd8866 --- /dev/null +++ b/src/Build5Nines.SharpVector/DatabaseInfo.cs @@ -0,0 +1,25 @@ +namespace Build5Nines.SharpVector; + +public class DatabaseInfo +{ + internal static string SupportedVersion = "1.0.0"; + internal static string SupportedSchema = "Build5Nines.SharpVector"; + + public DatabaseInfo() + : this(null, null, null) + { } + public DatabaseInfo(string? classType) + : this(SupportedSchema, SupportedVersion, classType) + { } + + public DatabaseInfo(string? schema, string? version, string? classType) + { + Schema = schema; + Version = version; + ClassType = classType; + } + + public string? Schema { get; set; } + public string? Version { get; set; } + public string? 
ClassType { get; set; } +} \ No newline at end of file diff --git a/src/Build5Nines.SharpVector/IVectorDatabaseExtensions.cs b/src/Build5Nines.SharpVector/IVectorDatabaseExtensions.cs index 83b9bcf..13a1ce4 100644 --- a/src/Build5Nines.SharpVector/IVectorDatabaseExtensions.cs +++ b/src/Build5Nines.SharpVector/IVectorDatabaseExtensions.cs @@ -2,37 +2,37 @@ namespace Build5Nines.SharpVector; public static class IVectorDatabaseExtensions { - public static async Task SaveToFileAsync(this IVectorDatabase vectorDatabase, string filename) + public static async Task SaveToFileAsync(this IVectorDatabase vectorDatabase, string filePath) where TId : notnull { - using (var stream = new FileStream(filename, FileMode.Create, FileAccess.Write)) + using (var stream = new FileStream(filePath, FileMode.Create, FileAccess.Write)) { await vectorDatabase.SerializeToJsonStreamAsync(stream); } } - public static void SaveToFile(this IVectorDatabase vectorDatabase, string filename) + public static void SaveToFile(this IVectorDatabase vectorDatabase, string filePath) where TId : notnull { - using (var stream = new FileStream(filename, FileMode.Create, FileAccess.Write)) + using (var stream = new FileStream(filePath, FileMode.Create, FileAccess.Write)) { vectorDatabase.SerializeToJsonStream(stream); } } - public static async Task LoadFromFileAsync(this IVectorDatabase vectorDatabase, string filename) + public static async Task LoadFromFileAsync(this IVectorDatabase vectorDatabase, string filePath) where TId : notnull { - using (var stream = new FileStream(filename, FileMode.Open, FileAccess.Read)) + using (var stream = new FileStream(filePath, FileMode.Open, FileAccess.Read)) { await vectorDatabase.DeserializeFromJsonStreamAsync(stream); } } - public static void LoadFromFile(IVectorDatabase vectorDatabase, string filename) + public static void LoadFromFile(IVectorDatabase vectorDatabase, string filePath) where TId : notnull { - using (var stream = new FileStream(filename, FileMode.Open, 
FileAccess.Read)) + using (var stream = new FileStream(filePath, FileMode.Open, FileAccess.Read)) { vectorDatabase.DeserializeFromJsonStream(stream); } diff --git a/src/Build5Nines.SharpVector/MemoryVectorDatabaseBase.cs b/src/Build5Nines.SharpVector/MemoryVectorDatabaseBase.cs index 252a2b4..479a179 100644 --- a/src/Build5Nines.SharpVector/MemoryVectorDatabaseBase.cs +++ b/src/Build5Nines.SharpVector/MemoryVectorDatabaseBase.cs @@ -7,6 +7,7 @@ using System.Collections.Concurrent; using System.IO.Compression; using System.Runtime.CompilerServices; +using System.Text.Json; namespace Build5Nines.SharpVector; @@ -254,13 +255,16 @@ public virtual async Task SerializeToJsonStreamAsync(Stream stream) using (var archive = new ZipArchive(stream, ZipArchiveMode.Create, true)) { - var entryDatabaseType = archive.CreateEntry("DatabaseType.txt"); + var entryDatabaseType = archive.CreateEntry("database.json"); using (var entryStream = entryDatabaseType.Open()) { - var typeName = this.GetType().FullName; - if (typeName != null) + var databaseInfo = new DatabaseInfo(this.GetType().FullName); + + var databaseInfoJson = JsonSerializer.Serialize(databaseInfo); + + if (databaseInfoJson != null) { - var databaseTypeBytes = System.Text.Encoding.UTF8.GetBytes(typeName); + var databaseTypeBytes = System.Text.Encoding.UTF8.GetBytes(databaseInfoJson); await entryStream.WriteAsync(databaseTypeBytes); await entryStream.FlushAsync(); } @@ -269,7 +273,7 @@ public virtual async Task SerializeToJsonStreamAsync(Stream stream) throw new InvalidOperationException("Type name cannot be null."); } } - var entryVectorStore = archive.CreateEntry("VectorStore.json"); + var entryVectorStore = archive.CreateEntry("vectorstore.json"); using (var entryStream = entryVectorStore.Open()) { streamVectorStore.Position = 0; @@ -277,7 +281,7 @@ public virtual async Task SerializeToJsonStreamAsync(Stream stream) await entryStream.FlushAsync(); } - var entryVocabularyStore = 
archive.CreateEntry("VocabularyStore.json"); + var entryVocabularyStore = archive.CreateEntry("vocabularystore.json"); using (var entryStream = entryVocabularyStore.Open()) { streamVocabularyStore.Position = 0; @@ -307,7 +311,7 @@ public virtual async Task DeserializeFromJsonStreamAsync(Stream stream) using (var archive = new ZipArchive(stream, ZipArchiveMode.Read)) { - var entryDatabaseType = archive.GetEntry("DatabaseType.txt"); + var entryDatabaseType = archive.GetEntry("database.json"); if (entryDatabaseType != null) { using (var entryStream = entryDatabaseType.Open()) @@ -318,20 +322,38 @@ public virtual async Task DeserializeFromJsonStreamAsync(Stream stream) var databaseTypeBytes = new byte[databaseTypeStream.Length]; await databaseTypeStream.ReadAsync(databaseTypeBytes); - var databaseType = System.Text.Encoding.UTF8.GetString(databaseTypeBytes); + var databaseInfoJson = System.Text.Encoding.UTF8.GetString(databaseTypeBytes); + + var databaseInfo = JsonSerializer.Deserialize(databaseInfoJson); + + if (databaseInfo == null) + { + throw new DatabaseFileInfoException("Database info entry is null."); + } + + if (databaseInfo.Schema != DatabaseInfo.SupportedSchema) + { + throw new DatabaseFileSchemaException($"The database schema does not match the expected schema (Expected: {DatabaseInfo.SupportedSchema} - Actual: {databaseInfo.Schema})."); + } - if (databaseType != this.GetType().FullName) + if (databaseInfo.Version != DatabaseInfo.SupportedVersion) { - throw new InvalidOperationException($"The database type does not match the expected type [Expected: {databaseType}] "); + throw new DatabaseFileVersionException($"The database version does not match the expected version (Expected: {DatabaseInfo.SupportedVersion} - Actual: {databaseInfo.Version})."); + } + + if (databaseInfo.ClassType != this.GetType().FullName) + { + throw new DatabaseFileClassTypeException($"The database class type does not match the expected type (Expected: {this.GetType().FullName} - Actual: 
{databaseInfo.ClassType})"); } } } else { - throw new InvalidOperationException("Database type entry not found."); + throw new DatabaseFileMissingEntryException("Database info entry not found.", "database"); } - var entryVectorStore = archive.GetEntry("VectorStore.json"); + + var entryVectorStore = archive.GetEntry("vectorstore.json"); if (entryVectorStore != null) { using (var entryStream = entryVectorStore.Open()) @@ -341,10 +363,10 @@ public virtual async Task DeserializeFromJsonStreamAsync(Stream stream) } else { - throw new InvalidOperationException("Vector Store entry not found."); + throw new DatabaseFileMissingEntryException("Vector Store entry not found.", "vectorstore"); } - var entryVocabularyStore = archive.GetEntry("VocabularyStore.json"); + var entryVocabularyStore = archive.GetEntry("vocabularystore.json"); if (entryVocabularyStore != null) { using (var entryStream = entryVocabularyStore.Open()) @@ -354,8 +376,9 @@ public virtual async Task DeserializeFromJsonStreamAsync(Stream stream) } else { - throw new InvalidOperationException("Vocabulary Store entry not found."); + throw new DatabaseFileMissingEntryException("Vocabulary Store entry not found.", "vocabularystore"); } + } } diff --git a/src/Build5Nines.SharpVector/Preprocessing/BasicTextPreprocessor.cs b/src/Build5Nines.SharpVector/Preprocessing/BasicTextPreprocessor.cs index 3b878a9..ba94a5e 100644 --- a/src/Build5Nines.SharpVector/Preprocessing/BasicTextPreprocessor.cs +++ b/src/Build5Nines.SharpVector/Preprocessing/BasicTextPreprocessor.cs @@ -8,9 +8,24 @@ public class BasicTextPreprocessor : ITextPreprocessor public IEnumerable TokenizeAndPreprocess(string text) { text = text.ToLower(); - text = Regex.Replace(text, @"[^\w\s]", ""); - text = Regex.Replace(text, @"\s+", " ").Trim(); - return text.Split(' ').ToList(); + + // Check if text contains Chinese characters using the CJK Unified Ideographs block + if (Regex.IsMatch(text, @"\p{IsCJKUnifiedIdeographs}")) + { + // Remove punctuation 
(excluding Chinese characters) + text = Regex.Replace(text, @"[^\p{IsCJKUnifiedIdeographs}\w\s]", ""); + // Tokenize either by matching individual Chinese characters or contiguous word tokens (for Latin letters/digits) + var tokens = Regex.Matches(text, @"[\p{IsCJKUnifiedIdeographs}]|[a-z0-9]+") + .Cast() + .Select(m => m.Value); + return tokens.ToList(); + } + else + { + text = Regex.Replace(text, @"[^\w\s]", ""); + text = Regex.Replace(text, @"\s+", " ").Trim(); + return text.Split(' ').ToList(); + } } public async Task> TokenizeAndPreprocessAsync(string text) diff --git a/src/ConsoleTest/Program.cs b/src/ConsoleTest/Program.cs index 424cec7..4059b5f 100644 --- a/src/ConsoleTest/Program.cs +++ b/src/ConsoleTest/Program.cs @@ -45,11 +45,11 @@ public static async Task Main(string[] args) var jsonString = await File.ReadAllTextAsync("movies.json"); - var importTimer = new Stopwatch(); - importTimer.Start(); + var timer = new Stopwatch(); + timer.Start(); - for (var i = 0; i < 10; i++){ + //for (var i = 0; i < 10; i++){ using (JsonDocument document = JsonDocument.Parse(jsonString)) { JsonElement root = document.RootElement; @@ -65,29 +65,30 @@ await Parallel.ForEachAsync(movies.EnumerateArray(), async (movie, cancellationT await vdb.AddTextAsync(text, metadata); } }); - - // foreach (JsonElement movie in movies.EnumerateArray()) - // { - // var text = movie.GetProperty("description").GetString(); - // var metadata = movie.GetProperty("title").GetString(); - - // if (!string.IsNullOrWhiteSpace(text) && !string.IsNullOrWhiteSpace(metadata)) - // { - // await vdb.AddTextAsync(text, metadata); - // } - // } - } } + //} + + timer.Stop(); + Console.WriteLine($"Movie data imported into Vector Database (Elapsed: {timer.ElapsedMilliseconds} ms)"); - importTimer.Stop(); - Console.WriteLine("Movie data imported into Vector Database."); - Console.WriteLine($"Import took {importTimer.ElapsedMilliseconds} ms"); + Console.WriteLine("Saving Vector Database to file..."); + 
timer.Restart(); + await vdb.SaveToFileAsync("movies.b59vdb"); + timer.Stop(); + Console.WriteLine($"Vector Database saved to file (Elapsed: {timer.ElapsedMilliseconds} ms)"); + Console.WriteLine("Loading Vector Database from file..."); + timer.Restart(); + + await vdb.LoadFromFileAsync("movies.b59vdb"); + + timer.Stop(); + Console.WriteLine($"Vector Database loaded from file (Elapsed: {timer.ElapsedMilliseconds} ms)"); // Paths to the large text files @@ -158,9 +159,7 @@ await Parallel.ForEachAsync(movies.EnumerateArray(), async (movie, cancellationT if (newPrompt != null) { IVectorTextResult result; - var timer = new Stopwatch(); - timer.Start(); - + timer.Restart(); var pageSize = 3; // result = await vdb.Search(newPrompt, diff --git a/src/OpenAIConsoleTest/OpenAIConsoleTest.csproj b/src/OpenAIConsoleTest/OpenAIConsoleTest.csproj index 7763900..ec2c116 100644 --- a/src/OpenAIConsoleTest/OpenAIConsoleTest.csproj +++ b/src/OpenAIConsoleTest/OpenAIConsoleTest.csproj @@ -7,14 +7,14 @@ enable - - - - + + + + - - + + diff --git a/src/OpenAIConsoleTest/Program.cs b/src/OpenAIConsoleTest/Program.cs index 71005af..36d034a 100644 --- a/src/OpenAIConsoleTest/Program.cs +++ b/src/OpenAIConsoleTest/Program.cs @@ -77,15 +77,12 @@ await Parallel.ForEachAsync(movies.EnumerateArray(), async (movie, cancellationT Console.WriteLine(string.Empty); if (newPrompt != null) { - IVectorTextResult result; - var timer = new Stopwatch(); timer.Start(); - var pageSize = 3; // result = await vdb.Search(newPrompt, - result = await vdb.SearchAsync(newPrompt, + var result = await vdb.SearchAsync(newPrompt, threshold: 0.001f, // 0.2f, // Cosine Similarity - Only return results with similarity greater than this threshold // threshold: (float)1.4f, // Euclidean Distance - Only return results with distance less than this threshold diff --git a/src/SharpVector.sln b/src/SharpVector.sln index afa1c89..9b5c1e5 100644 --- a/src/SharpVector.sln +++ b/src/SharpVector.sln @@ -9,6 +9,8 @@ 
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Build5Nines.SharpVector", " EndProject Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "SharpVectorTest", "SharpVectorTest\SharpVectorTest.csproj", "{AFF76051-E043-45EB-9B5F-05D9C45D0DC7}" EndProject +Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "SharpVectorPerformance", "SharpVectorPerformance\SharpVectorPerformance.csproj", "{AFF76051-E043-45EB-9B5F-05D9C45D0DC7}" +EndProject Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Build5Nines.SharpVector.OpenAI", "Build5Nines.SharpVector.OpenAI\Build5Nines.SharpVector.OpenAI.csproj", "{CABF1DBE-8FE1-4EDF-B5DD-B1BFB88D93C3}" EndProject Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "SharpVectorOpenAITest", "SharpVectorOpenAITest\SharpVectorOpenAITest.csproj", "{04E08FA2-C4B4-47B4-ABB0-6FD57EA5FFFB}" diff --git a/src/SharpVectorOpenAITest/BasicOpenAIMemoryVectorDatabaseTest.cs b/src/SharpVectorOpenAITest/BasicOpenAIMemoryVectorDatabaseTest.cs index 6779c78..392d98d 100644 --- a/src/SharpVectorOpenAITest/BasicOpenAIMemoryVectorDatabaseTest.cs +++ b/src/SharpVectorOpenAITest/BasicOpenAIMemoryVectorDatabaseTest.cs @@ -29,5 +29,14 @@ public void TestInitialization() Assert.IsNotNull(_database); } + [TestMethod] + public async Task Test_SaveLoad_01() + { + var filename = "openai_test_saveload_01.b59vdb"; + await _database.SaveToFileAsync(filename); + + await _database.LoadFromFileAsync(filename); + } + } } \ No newline at end of file diff --git a/src/SharpVectorOpenAITest/SharpVectorOpenAITest.csproj b/src/SharpVectorOpenAITest/SharpVectorOpenAITest.csproj index bf8dd58..f983d11 100644 --- a/src/SharpVectorOpenAITest/SharpVectorOpenAITest.csproj +++ b/src/SharpVectorOpenAITest/SharpVectorOpenAITest.csproj @@ -10,7 +10,7 @@ - + @@ -23,8 +23,8 @@ - - + + diff --git a/src/SharpVectorPerformance/MemoryVectorDatabasePerformance.cs b/src/SharpVectorPerformance/MemoryVectorDatabasePerformance.cs new file mode 100644 index 0000000..72524ab --- /dev/null +++ 
b/src/SharpVectorPerformance/MemoryVectorDatabasePerformance.cs @@ -0,0 +1,70 @@ +namespace SharpVectorPerformance; + +using System.Diagnostics; +using Build5Nines.SharpVector; +using Build5Nines.SharpVector.Id; +using Build5Nines.SharpVector.Preprocessing; +using Build5Nines.SharpVector.VectorCompare; +using Build5Nines.SharpVector.Vectorization; +using Build5Nines.SharpVector.VectorStore; +using Build5Nines.SharpVector.Vocabulary; +using BenchmarkDotNet.Attributes; +using BenchmarkDotNet.Running; + +[MemoryDiagnoser] +public class MemoryVectorDatabasePerformance +{ + private MemoryVectorDatabase database; + private string fileName = "memory_vector_database_test.b59vdb"; + + [GlobalSetup] + public async Task Setup() + { + database = new MemoryVectorDatabase(); + // // Load Vector Database with some sample text + var textTasks = new List(); + for (int i = 0; i < 100; i++) + { + textTasks.Add(database.AddTextAsync("The Lion King is a 1994 Disney animated film about a young lion cub named Simba who is the heir to the throne of an African savanna.", 5.0)); + textTasks.Add(database.AddTextAsync("Aladdin is a 2019 live-action Disney adaptation of the 1992 animated classic of the same name about a street urchin who finds a magic lamp and uses a genie's wishes to become a prince so he can marry Princess Jasmine.", 5.0)); + textTasks.Add(database.AddTextAsync("The Little Mermaid is a 2023 live-action adaptation of Disney's 1989 animated film of the same name. The movie is about Ariel, the youngest of King Triton's daughters, who is fascinated by the human world and falls in love with Prince Eric.", 5.0)); + textTasks.Add(database.AddTextAsync("Frozen is a 2013 Disney movie about a fearless optimist named Anna who sets off on a journey to find her sister Elsa, whose icy powers have trapped their kingdom in eternal winter.", 5.0)); + textTasks.Add(database.AddTextAsync("Tangled is a 2010 Disney animated comedy adventure film based on the story of Rapunzel. 
The movie is about a long-lost princess with magical blonde hair who has been locked in a tower her entire life by Gothel, who wants to use Rapunzel's powers for herself.", 5.0)); + textTasks.Add(database.AddTextAsync("Wreck-It Ralph is a 2012 Disney animated film about Ralph, a character who plays the bad guy in the arcade game Fix-It Felix Jr. for 30 years. Ralph is a muscular, 9-foot-tall character with spiky auburn hair, a pink nose, and large hands and feet. He wears burgundy overalls with a broken strap, a plaid shirt with ripped sleeves, and a teal undershirt.", 5.0)); + textTasks.Add(database.AddTextAsync("Iron Man (2008) is a Marvel Studios action, adventure, and sci-fi movie about Tony Stark (Robert Downey Jr.), a billionaire inventor and weapons developer who is kidnapped by terrorists and forced to build a weapon. Instead, Tony uses his ingenuity to build a high-tech suit of armor and escape, becoming the superhero Iron Man. He then returns to the United States to refine the suit and use it to fight crime and terrorism.", 5.0)); + textTasks.Add(database.AddTextAsync("Black Panther is a 2018 Marvel Studios movie about T'Challa, the heir to the isolated African nation of Wakanda, who returns home to take the throne after his father's death. However, T'Challa faces challenges from within his own country, including Killmonger, who wants to abandon Wakanda's isolationist policies and start a global revolution. T'Challa must team up with C.I.A. agent Everett K. Ross and the Dora Milaje, Wakanda's special forces, to prevent Wakanda from being drawn into a world war.", 5.0)); + textTasks.Add(database.AddTextAsync("Black Panther: Wakanda Forever is a 2022 Marvel movie about the Wakandans fighting to protect their country from world powers after the death of King T'Challa. 
The movie is a sequel to the popular Black Panther and stars Chadwick Boseman as T'Challa, Letitia Wright as Shuri, Angela Bassett as Ramonda, and Tenoch Huerta Mejía as Namor.", 5.0)); + textTasks.Add(database.AddTextAsync("The Incredible Hulk is a 2008 Marvel movie about scientist Bruce Banner (Edward Norton) who turns into a giant green monster called the Hulk when he's angry or frightened. After a gamma radiation accident, Banner is on the run from the military while searching for a cure for his condition.", 5.0)); + textTasks.Add(database.AddTextAsync("Hackers is a 1995 American crime thriller film about a group of high school hackers who discover a criminal plot to use a computer virus to destroy five oil tankers. The film stars Jonny Lee Miller, Angelina Jolie, Jesse Bradford, Matthew Lillard, Laurence Mason, Renoly Santiago, Lorraine Bracco, and Fisher Stevens. Iain Softley directed the film, which was made during the mid-1990s when the internet was becoming popular.", 5.0)); + textTasks.Add(database.AddTextAsync("WarGames is a 1983 American techno-thriller film about a high school computer hacker who accidentally accesses a top secret military supercomputer that controls the U.S. nuclear arsenal. The hacker, David Lightman (Matthew Broderick), starts a game of Global Thermonuclear War, triggering a false alarm that threatens to start World War III. David must convince the computer that he only wanted to play a game and not the real thing, with help from his girlfriend (Ally Sheedy) and a government official (Dabney Coleman)", 5.0)); + textTasks.Add(database.AddTextAsync("Cars is a 2006 Pixar movie about a rookie race car named Lightning McQueen who gets stranded in a small town while on his way to an important race. McQueen accidentally damages the road in Radiator Springs, a forgotten town on Route 66, and is forced to repair it. 
While there, he meets Sally, Mater, Doc Hudson, and other characters who help him learn that there's more to life than fame and trophies. McQueen finds friendship and love in the town, and begins to reevaluate his priorities. The movie teaches McQueen the importance of caring for others, integrity, and that winning isn't everything.", 5.0)); + textTasks.Add(database.AddTextAsync("The Incredibles is a 2004 Pixar animated action-adventure film about a family of superheroes who are forced to live a normal suburban life while hiding their powers. The movie is set in a retro-futuristic 1960s and has a runtime of 1 hour and 55 minutes.", 5.0)); + textTasks.Add(database.AddTextAsync("Toy Story is a 1995 animated comedy film about the relationship between Woody, a cowboy doll, and Buzz Lightyear, an action figure. The film takes place in a world where toys come to life when humans are not present. Woody is the leader of the toys in Andy's room, including a Tyrannosaurus Rex and Mr. Potato Head. When Buzz becomes Andy's favorite toy, Woody becomes jealous and plots against him. When Andy's family moves, Woody and Buzz must escape the clutches of their neighbor, Sid Phillips, and reunite with Andy.", 5.0)); + textTasks.Add(database.AddTextAsync("In Toy Story 2, Andy's toys are left to their own devices while he goes to Cowboy Camp, and Woody is kidnapped by a toy collector named Al McWhiggin. 
Buzz Lightyear and the other toys set out on a rescue mission to save Woody before he becomes a museum toy.", 5.0)); + textTasks.Add(database.AddTextAsync("Iron Man 2 is a 2010 action-adventure fantasy film about Tony Stark (Robert Downey Jr.), a billionaire inventor and superhero who must deal with declining health, government pressure, and a vengeful enemy.", 5.0)); + } + // 1700 text documents + + await Task.WhenAll(textTasks.ToArray()); + } + + [Benchmark] + public async Task SaveLoadPerformanceTest_BasicMemoryVectorDatabase_001() + { + await database.SaveToFileAsync(fileName); + + await database.LoadFromFileAsync(fileName); + } + + [Benchmark] + public async Task SavePerformanceTest_BasicMemoryVectorDatabase_001() + { + await database.SaveToFileAsync(fileName); + } + + [Benchmark] + public async Task LoadPerformanceTest_BasicMemoryVectorDatabase_001() + { + await database.LoadFromFileAsync(fileName); + } +} \ No newline at end of file diff --git a/src/SharpVectorPerformance/Program.cs b/src/SharpVectorPerformance/Program.cs new file mode 100644 index 0000000..375e8d0 --- /dev/null +++ b/src/SharpVectorPerformance/Program.cs @@ -0,0 +1,12 @@ +// See https://aka.ms/new-console-template for more information +using BenchmarkDotNet.Running; + +namespace SharpVectorPerformance; + +public class Program +{ + public static void Main(string[] args) + { + BenchmarkRunner.Run(); + } +} \ No newline at end of file diff --git a/src/SharpVectorPerformance/SharpVectorPerformance.csproj b/src/SharpVectorPerformance/SharpVectorPerformance.csproj new file mode 100644 index 0000000..05ef0af --- /dev/null +++ b/src/SharpVectorPerformance/SharpVectorPerformance.csproj @@ -0,0 +1,18 @@ + + + + Exe + net8.0 + enable + enable + + + + + + + + + + + diff --git a/src/SharpVectorTest/VectorDatabaseTests.cs b/src/SharpVectorTest/VectorDatabaseTests.cs index 176fd16..58bfa51 100644 --- a/src/SharpVectorTest/VectorDatabaseTests.cs +++ b/src/SharpVectorTest/VectorDatabaseTests.cs @@ -1,5 +1,7 
@@ namespace SharpVectorTest; +using System.Diagnostics; +using System.Threading.Tasks; using Build5Nines.SharpVector; using Build5Nines.SharpVector.Id; using Build5Nines.SharpVector.Preprocessing; @@ -302,6 +304,66 @@ public void Text_Update_01() Assert.AreEqual("{ value: \"JSON Metadata Value\" }", results.Texts.First().Metadata); } + [TestMethod] + public void Text_Update_01_Chinese() + { + var vdb = new MemoryVectorDatabase(); + + // Load Vector Database with Chinese sample text and JSON metadata + var id = vdb.AddText("狮子王是一部1994年的迪士尼动画电影,讲述一个小狮子辛巴必将继承非洲大草原王位的故事。", "{ value: \"元数据初始值\" }"); + + // Verify that search returns the expected text + var results = vdb.Search("狮子"); + Assert.AreEqual(1, results.Texts.Count()); + Assert.IsTrue(results.Texts.First().Text.Contains("狮子王")); + + // Update the text + vdb.UpdateText(id, "狮子王是一部非常棒的电影!"); + + // Verify that the text is updated but the metadata remains unchanged + results = vdb.Search("狮子"); + Assert.AreEqual("狮子王是一部非常棒的电影!", results.Texts.First().Text); + Assert.AreEqual("{ value: \"元数据初始值\" }", results.Texts.First().Metadata); + } + + [TestMethod] + public void Text_Update_01_English_and_Chinese() + { + var vdb = new MemoryVectorDatabase(); + + // Load the Vector Database with some initial sample texts. + vdb.AddText("The Lion King is a 1994 Disney animated film about a young lion cub named Simba who is the heir to the throne of an African savanna.", "{ value: \"Lion King Metadata\" }"); + vdb.AddText("Aladdin is a 2019 live-action Disney adaptation of the 1992 animated classic about a street urchin who finds a magic lamp.", "{ value: \"Aladdin Metadata\" }"); + vdb.AddText("The Little Mermaid is a 2023 live-action adaptation of Disney's classic animated film about Ariel.", "{ value: \"Little Mermaid Metadata\" }"); + + // Add additional texts to the database. 
+ vdb.AddText("Mulan is an epic tale of bravery and honor in ancient China.", "{ value: \"Mulan Metadata\" }"); + vdb.AddText("Crouching Tiger, Hidden Dragon is a martial arts masterpiece with breathtaking scenes.", "{ value: \"Crouching Metadata\" }"); + vdb.AddText("In the Mood for Love is a visually stunning film about forbidden romance.", "{ value: \"In the Mood Metadata\" }"); + + // Add more Chinese texts. + vdb.AddText("大闹天宫是一部经典 的中国动画电影,讲述孙悟空大闹天宫的故事。", "{ value: \"元数据新增1\" }"); + vdb.AddText("霸王别姬是一部关于 爱与背叛的中国史诗电影。", "{ value: \"元数据新增2\" }"); + + // Verify that a search for "Lion King" returns the expected result. + var lionResults = vdb.Search("Lion King"); + Assert.IsTrue(lionResults.Texts.Any(t => t.Text.Contains("Lion King"))); + Assert.AreEqual("{ value: \"Lion King Metadata\" }", lionResults.Texts.First().Metadata); + + // Verify that the Chinese texts were added. + var daNaoResults = vdb.Search("部经典"); + Assert.IsTrue(daNaoResults.Texts.Any(t => t.Text.Contains("部经典"))); + Assert.AreEqual("{ value: \"元数据新增1\" }", daNaoResults.Texts.First().Metadata); + + var baiJieResults = vdb.Search("霸王别姬"); + Assert.IsTrue(baiJieResults.Texts.Any(t => t.Text.Contains("霸王别姬"))); + Assert.AreEqual("{ value: \"元数据新增2\" }", baiJieResults.Texts.First().Metadata); + + var baiJieResults2 = vdb.Search("宫故事"); + Assert.IsTrue(baiJieResults2.Texts.Any(t => t.Text.Contains("霸王别姬"))); + Assert.AreEqual("{ value: \"元数据新增1\" }", baiJieResults2.Texts.First().Metadata); + } + [TestMethod] public void Text_Metadata_String_01() { @@ -485,6 +547,217 @@ public async Task SaveLoadFile_001() Assert.AreEqual("NewNewNew", thirdResult.Texts.First().Text); Assert.AreEqual(4.5, thirdResult.Texts.First().Metadata); } + + [TestMethod] + public async Task SaveLoadFile_002() + { + var databaseOne = new MemoryVectorDatabase(); + + // // Load Vector Database with some sample text + var textTasks = new List(); + for (int i = 0; i < 100; i++) + { + textTasks.Add(databaseOne.AddTextAsync("The Lion 
King is a 1994 Disney animated film about a young lion cub named Simba who is the heir to the throne of an African savanna.", 5.0)); + textTasks.Add(databaseOne.AddTextAsync("Aladdin is a 2019 live-action Disney adaptation of the 1992 animated classic of the same name about a street urchin who finds a magic lamp and uses a genie's wishes to become a prince so he can marry Princess Jasmine.", 5.0)); + textTasks.Add(databaseOne.AddTextAsync("The Little Mermaid is a 2023 live-action adaptation of Disney's 1989 animated film of the same name. The movie is about Ariel, the youngest of King Triton's daughters, who is fascinated by the human world and falls in love with Prince Eric.", 5.0)); + textTasks.Add(databaseOne.AddTextAsync("Frozen is a 2013 Disney movie about a fearless optimist named Anna who sets off on a journey to find her sister Elsa, whose icy powers have trapped their kingdom in eternal winter.", 5.0)); + textTasks.Add(databaseOne.AddTextAsync("Tangled is a 2010 Disney animated comedy adventure film based on the story of Rapunzel. The movie is about a long-lost princess with magical blonde hair who has been locked in a tower her entire life by Gothel, who wants to use Rapunzel's powers for herself.", 5.0)); + textTasks.Add(databaseOne.AddTextAsync("Wreck-It Ralph is a 2012 Disney animated film about Ralph, a character who plays the bad guy in the arcade game Fix-It Felix Jr. for 30 years. Ralph is a muscular, 9-foot-tall character with spiky auburn hair, a pink nose, and large hands and feet. He wears burgundy overalls with a broken strap, a plaid shirt with ripped sleeves, and a teal undershirt.", 5.0)); + textTasks.Add(databaseOne.AddTextAsync("Iron Man (2008) is a Marvel Studios action, adventure, and sci-fi movie about Tony Stark (Robert Downey Jr.), a billionaire inventor and weapons developer who is kidnapped by terrorists and forced to build a weapon. 
Instead, Tony uses his ingenuity to build a high-tech suit of armor and escape, becoming the superhero Iron Man. He then returns to the United States to refine the suit and use it to fight crime and terrorism.", 5.0)); + textTasks.Add(databaseOne.AddTextAsync("Black Panther is a 2018 Marvel Studios movie about T'Challa, the heir to the isolated African nation of Wakanda, who returns home to take the throne after his father's death. However, T'Challa faces challenges from within his own country, including Killmonger, who wants to abandon Wakanda's isolationist policies and start a global revolution. T'Challa must team up with C.I.A. agent Everett K. Ross and the Dora Milaje, Wakanda's special forces, to prevent Wakanda from being drawn into a world war.", 5.0)); + textTasks.Add(databaseOne.AddTextAsync("Black Panther: Wakanda Forever is a 2022 Marvel movie about the Wakandans fighting to protect their country from world powers after the death of King T'Challa. The movie is a sequel to the popular Black Panther and stars Chadwick Boseman as T'Challa, Letitia Wright as Shuri, Angela Bassett as Ramonda, and Tenoch Huerta Mejía as Namor.", 5.0)); + textTasks.Add(databaseOne.AddTextAsync("The Incredible Hulk is a 2008 Marvel movie about scientist Bruce Banner (Edward Norton) who turns into a giant green monster called the Hulk when he's angry or frightened. After a gamma radiation accident, Banner is on the run from the military while searching for a cure for his condition.", 5.0)); + textTasks.Add(databaseOne.AddTextAsync("Hackers is a 1995 American crime thriller film about a group of high school hackers who discover a criminal plot to use a computer virus to destroy five oil tankers. The film stars Jonny Lee Miller, Angelina Jolie, Jesse Bradford, Matthew Lillard, Laurence Mason, Renoly Santiago, Lorraine Bracco, and Fisher Stevens. 
Iain Softley directed the film, which was made during the mid-1990s when the internet was becoming popular.", 5.0)); + textTasks.Add(databaseOne.AddTextAsync("WarGames is a 1983 American techno-thriller film about a high school computer hacker who accidentally accesses a top secret military supercomputer that controls the U.S. nuclear arsenal. The hacker, David Lightman (Matthew Broderick), starts a game of Global Thermonuclear War, triggering a false alarm that threatens to start World War III. David must convince the computer that he only wanted to play a game and not the real thing, with help from his girlfriend (Ally Sheedy) and a government official (Dabney Coleman)", 5.0)); + textTasks.Add(databaseOne.AddTextAsync("Cars is a 2006 Pixar movie about a rookie race car named Lightning McQueen who gets stranded in a small town while on his way to an important race. McQueen accidentally damages the road in Radiator Springs, a forgotten town on Route 66, and is forced to repair it. While there, he meets Sally, Mater, Doc Hudson, and other characters who help him learn that there's more to life than fame and trophies. McQueen finds friendship and love in the town, and begins to reevaluate his priorities. The movie teaches McQueen the importance of caring for others, integrity, and that winning isn't everything.", 5.0)); + textTasks.Add(databaseOne.AddTextAsync("The Incredibles is a 2004 Pixar animated action-adventure film about a family of superheroes who are forced to live a normal suburban life while hiding their powers. The movie is set in a retro-futuristic 1960s and has a runtime of 1 hour and 55 minutes.", 5.0)); + textTasks.Add(databaseOne.AddTextAsync("Toy Story is a 1995 animated comedy film about the relationship between Woody, a cowboy doll, and Buzz Lightyear, an action figure. The film takes place in a world where toys come to life when humans are not present. Woody is the leader of the toys in Andy's room, including a Tyrannosaurus Rex and Mr. 
Potato Head. When Buzz becomes Andy's favorite toy, Woody becomes jealous and plots against him. When Andy's family moves, Woody and Buzz must escape the clutches of their neighbor, Sid Phillips, and reunite with Andy.", 5.0)); + textTasks.Add(databaseOne.AddTextAsync("In Toy Story 2, Andy's toys are left to their own devices while he goes to Cowboy Camp, and Woody is kidnapped by a toy collector named Al McWhiggin. Buzz Lightyear and the other toys set out on a rescue mission to save Woody before he becomes a museum toy.", 5.0)); + textTasks.Add(databaseOne.AddTextAsync("Iron Man 2 is a 2010 action-adventure fantasy film about Tony Stark (Robert Downey Jr.), a billionaire inventor and superhero who must deal with declining health, government pressure, and a vengeful enemy.", 5.0)); + } + + await Task.WhenAll(textTasks.ToArray()); + + var firstResult = await databaseOne.SearchAsync("Lion King", pageCount: 5); + Assert.AreEqual(5, firstResult.Texts.Count()); + Assert.IsTrue(firstResult.Texts.First().Text.Contains("Lion King")); + Assert.AreEqual(5.0, firstResult.Texts.First().Metadata); + Assert.AreEqual(0.3396831452846527, firstResult.Texts.First().VectorComparison); + + var fileName = "vector_database.b59vdb"; + var timer = new Stopwatch(); + + timer.Start(); + await databaseOne.SaveToFileAsync(fileName); + timer.Stop(); + Console.WriteLine($"SaveLoadFile_002: Save File: {timer.ElapsedMilliseconds} ms"); + + // This is a smoke test to just make sure that the save file is not taking way longer than expected + Assert.IsTrue(timer.ElapsedMilliseconds < 300, $"SaveLoadFile_002: Save File took too long - Expected: < 300 - Actual: {timer.ElapsedMilliseconds} ms"); + + var databaseTwo = new MemoryVectorDatabase(); + timer.Restart(); + await databaseTwo.LoadFromFileAsync(fileName); + timer.Stop(); + Console.WriteLine($"SaveLoadFile_002: Load File: {timer.ElapsedMilliseconds} ms"); + + // This is a smoke test to just make sure that the load file is not taking way longer 
than expected + Assert.IsTrue(timer.ElapsedMilliseconds < 300, $"SaveLoadFile_002: Load File took too long - Expected: < 300 - Actual: {timer.ElapsedMilliseconds} ms"); + + var secondResult = await databaseTwo.SearchAsync("Lion King", pageCount: 5); + Assert.AreEqual(5, secondResult.Texts.Count()); + Assert.IsTrue(secondResult.Texts.First().Text.Contains("Lion King")); + Assert.AreEqual(5.0, secondResult.Texts.First().Metadata); + Assert.AreEqual(0.3396831452846527, secondResult.Texts.First().VectorComparison); + + // Compare both results + Assert.AreEqual(firstResult.Texts.Count(), secondResult.Texts.Count()); + + databaseTwo.AddText("NewNewNew", 4.5); + var thirdResult = await databaseTwo.SearchAsync("NewNewNew", pageCount: 5); + Assert.AreEqual("NewNewNew", thirdResult.Texts.First().Text); + Assert.AreEqual(4.5, thirdResult.Texts.First().Metadata); + } + + [TestMethod] + public async Task DatabaseFile_LoadDatabaseInfo_001() + { + var vdb = new MemoryVectorDatabase(); + + // // Load Vector Database with some sample text + var id = vdb.AddText("The Lion King is a 1994 Disney animated film about a young lion cub named Simba who is the heir to the throne of an African savanna.", "{ value: \"JSON Metadata Value\" }"); + + vdb.SaveToFile("DatabaseFile_LoadDatabaseInfo_001.b59vdb"); + + var databaseInfo = await DatabaseFile.LoadDatabaseInfoAsync("DatabaseFile_LoadDatabaseInfo_001.b59vdb"); + + Assert.AreEqual("Build5Nines.SharpVector", databaseInfo.Schema); + Assert.AreEqual("1.0.0", databaseInfo.Version); + Assert.AreEqual("Build5Nines.SharpVector.MemoryVectorDatabase`1[[System.String, System.Private.CoreLib, Version=8.0.0.0, Culture=neutral, PublicKeyToken=7cec85d7bea7798e]]", databaseInfo.ClassType); + } + + [TestMethod] + public async Task DatabaseFile_LoadDatabaseInfo_002() + { + var vdb = new MemoryVectorDatabase(); + + // // Load Vector Database with some sample text + var id = vdb.AddText("The Lion King is a 1994 Disney animated film about a young lion cub 
named Simba who is the heir to the throne of an African savanna.", "{ value: \"JSON Metadata Value\" }"); + + var stream = new MemoryStream(); + await vdb.SerializeToJsonStreamAsync(stream); + stream.Position = 0; + + var databaseInfo = await DatabaseFile.LoadDatabaseInfoAsync(stream); + + Assert.AreEqual("Build5Nines.SharpVector", databaseInfo.Schema); + Assert.AreEqual("1.0.0", databaseInfo.Version); + Assert.AreEqual("Build5Nines.SharpVector.MemoryVectorDatabase`1[[System.String, System.Private.CoreLib, Version=8.0.0.0, Culture=neutral, PublicKeyToken=7cec85d7bea7798e]]", databaseInfo.ClassType); + } + + [TestMethod] + public async Task DatabaseFile_LoadStream_002() + { + var vdb = new MemoryVectorDatabase(); + + // // Load Vector Database with some sample text + var id = vdb.AddText("The Lion King is a 1994 Disney animated film about a young lion cub named Simba who is the heir to the throne of an African savanna.", "{ value: \"JSON Metadata Value\" }"); + + var stream = new MemoryStream(); + await vdb.SerializeToJsonStreamAsync(stream); + stream.Position = 0; + + vdb = await DatabaseFile.Load, string>(stream); + + var results = vdb.Search("Lion King"); + + Assert.AreEqual(1, results.Texts.Count()); + Assert.IsTrue(results.Texts.First().Text.Contains("Lion King")); + Assert.AreEqual("{ value: \"JSON Metadata Value\" }", results.Texts.First().Metadata); + Assert.AreEqual(0.3396831452846527, results.Texts.First().VectorComparison); + + vdb.UpdateTextMetadata(id, "{ value: \"New Value\" }"); + + results = vdb.Search("Lion King"); + Assert.AreEqual("{ value: \"New Value\" }", results.Texts.First().Metadata); + } + + [TestMethod] + public async Task DatabaseFile_LoadStream_003() + { + var vdb = new MemoryVectorDatabase(); + + // // Load Vector Database with some sample text + var id = vdb.AddText("The Lion King is a 1994 Disney animated film about a young lion cub named Simba who is the heir to the throne of an African savanna.", "{ value: \"JSON Metadata Value\" 
}"); + + var stream = new MemoryStream(); + await vdb.SerializeToJsonStreamAsync(stream); + stream.Position = 0; + + vdb = await DatabaseFile.Load(stream); + var results = vdb.Search("Lion King"); + + Assert.AreEqual(1, results.Texts.Count()); + Assert.IsTrue(results.Texts.First().Text.Contains("Lion King")); + Assert.AreEqual("{ value: \"JSON Metadata Value\" }", results.Texts.First().Metadata); + Assert.AreEqual(0.3396831452846527, results.Texts.First().VectorComparison); + + vdb.UpdateTextMetadata(id, "{ value: \"New Value\" }"); + + results = vdb.Search("Lion King"); + Assert.AreEqual("{ value: \"New Value\" }", results.Texts.First().Metadata); + } + + [TestMethod] + public async Task DatabaseFile_LoadFile_002() + { + var vdb = new MemoryVectorDatabase(); + + // // Load Vector Database with some sample text + var id = vdb.AddText("The Lion King is a 1994 Disney animated film about a young lion cub named Simba who is the heir to the throne of an African savanna.", "{ value: \"JSON Metadata Value\" }"); + + var filename = "DatabaseFile_LoadFile_003.b59vdb"; + await vdb.SaveToFileAsync(filename); + + vdb = await DatabaseFile.Load, string>(filename); + + var results = vdb.Search("Lion King"); + + Assert.AreEqual(1, results.Texts.Count()); + Assert.IsTrue(results.Texts.First().Text.Contains("Lion King")); + Assert.AreEqual("{ value: \"JSON Metadata Value\" }", results.Texts.First().Metadata); + Assert.AreEqual(0.3396831452846527, results.Texts.First().VectorComparison); + + vdb.UpdateTextMetadata(id, "{ value: \"New Value\" }"); + + results = vdb.Search("Lion King"); + Assert.AreEqual("{ value: \"New Value\" }", results.Texts.First().Metadata); + } + + [TestMethod] + public async Task DatabaseFile_LoadFile_003() + { + var vdb = new MemoryVectorDatabase(); + + // // Load Vector Database with some sample text + var id = vdb.AddText("The Lion King is a 1994 Disney animated film about a young lion cub named Simba who is the heir to the throne of an African savanna.", "{ 
value: \"JSON Metadata Value\" }"); + + var filename = "DatabaseFile_LoadFile_003.b59vdb"; + await vdb.SaveToFileAsync(filename); + + vdb = await DatabaseFile.Load(filename); + var results = vdb.Search("Lion King"); + + Assert.AreEqual(1, results.Texts.Count()); + Assert.IsTrue(results.Texts.First().Text.Contains("Lion King")); + Assert.AreEqual("{ value: \"JSON Metadata Value\" }", results.Texts.First().Metadata); + Assert.AreEqual(0.3396831452846527, results.Texts.First().VectorComparison); + + vdb.UpdateTextMetadata(id, "{ value: \"New Value\" }"); + + results = vdb.Search("Lion King"); + Assert.AreEqual("{ value: \"New Value\" }", results.Texts.First().Metadata); + } }