From 2e8e1df81f25fadd5c02559ba2f98fcdbd599441 Mon Sep 17 00:00:00 2001 From: Chris Pietschmann Date: Sat, 22 Feb 2025 13:24:27 -0500 Subject: [PATCH 01/33] added database info (schema, version) to exported vector database file --- .../docs/README.md | 4 ++ src/Build5Nines.SharpVector/DatabaseInfo.cs | 6 +++ .../MemoryVectorDatabaseBase.cs | 39 ++++++++++++++---- src/ConsoleTest/Program.cs | 41 +++++++++---------- 4 files changed, 61 insertions(+), 29 deletions(-) create mode 100644 src/Build5Nines.SharpVector/DatabaseInfo.cs diff --git a/src/Build5Nines.SharpVector.OpenAI/docs/README.md b/src/Build5Nines.SharpVector.OpenAI/docs/README.md index e097632..760ee5b 100644 --- a/src/Build5Nines.SharpVector.OpenAI/docs/README.md +++ b/src/Build5Nines.SharpVector.OpenAI/docs/README.md @@ -1,3 +1,7 @@ Build5Nines.SharpVector.OpenAI is a lightweight in-memory Vector Database for use in any .NET application that connects to an embeddings model running in Azure OpenAI for generating the text embeddings. The `Build5Nines.SharpVector.OpenAI.BasicOpenAIMemoryVectorDatabase` class uses an OpenAI Embeddings Client with Cosine similarity search. + +## Tutorials + +- [Enhanced In-Memory Text Vector Search in .NET with SharpVector and OpenAI Embeddings](https://build5nines.com/enhanced-in-memory-text-vector-search-in-net-with-sharpvector-and-openai-embeddings/?utm_source=github&utm_medium=sharpvector) by Chris Pietschmann \ No newline at end of file diff --git a/src/Build5Nines.SharpVector/DatabaseInfo.cs b/src/Build5Nines.SharpVector/DatabaseInfo.cs new file mode 100644 index 0000000..aadde0d --- /dev/null +++ b/src/Build5Nines.SharpVector/DatabaseInfo.cs @@ -0,0 +1,6 @@ +internal class DatabaseInfo +{ + public string? Schema { get; set; } = "Build5Nines.SharpVector"; + public string? Version { get; set; } + public string? 
VectorDatabaseClassType { get; set; } +} \ No newline at end of file diff --git a/src/Build5Nines.SharpVector/MemoryVectorDatabaseBase.cs b/src/Build5Nines.SharpVector/MemoryVectorDatabaseBase.cs index 252a2b4..8b22742 100644 --- a/src/Build5Nines.SharpVector/MemoryVectorDatabaseBase.cs +++ b/src/Build5Nines.SharpVector/MemoryVectorDatabaseBase.cs @@ -7,6 +7,7 @@ using System.Collections.Concurrent; using System.IO.Compression; using System.Runtime.CompilerServices; +using System.Text.Json; namespace Build5Nines.SharpVector; @@ -254,13 +255,18 @@ public virtual async Task SerializeToJsonStreamAsync(Stream stream) using (var archive = new ZipArchive(stream, ZipArchiveMode.Create, true)) { - var entryDatabaseType = archive.CreateEntry("DatabaseType.txt"); + var entryDatabaseType = archive.CreateEntry("Database.json"); using (var entryStream = entryDatabaseType.Open()) { - var typeName = this.GetType().FullName; - if (typeName != null) + var databaseInfo = new DatabaseInfo { + Version = "1.0.0", + VectorDatabaseClassType = this.GetType().FullName + }; + var databaseInfoJson = JsonSerializer.Serialize(databaseInfo); + + if (databaseInfoJson != null) { - var databaseTypeBytes = System.Text.Encoding.UTF8.GetBytes(typeName); + var databaseTypeBytes = System.Text.Encoding.UTF8.GetBytes(databaseInfoJson); await entryStream.WriteAsync(databaseTypeBytes); await entryStream.FlushAsync(); } @@ -307,7 +313,7 @@ public virtual async Task DeserializeFromJsonStreamAsync(Stream stream) using (var archive = new ZipArchive(stream, ZipArchiveMode.Read)) { - var entryDatabaseType = archive.GetEntry("DatabaseType.txt"); + var entryDatabaseType = archive.GetEntry("Database.json"); if (entryDatabaseType != null) { using (var entryStream = entryDatabaseType.Open()) @@ -318,11 +324,28 @@ public virtual async Task DeserializeFromJsonStreamAsync(Stream stream) var databaseTypeBytes = new byte[databaseTypeStream.Length]; await databaseTypeStream.ReadAsync(databaseTypeBytes); - var databaseType 
= System.Text.Encoding.UTF8.GetString(databaseTypeBytes); + var databaseInfoJson = System.Text.Encoding.UTF8.GetString(databaseTypeBytes); + + var databaseInfo = JsonSerializer.Deserialize(databaseInfoJson); + + if (databaseInfo == null) + { + throw new InvalidOperationException("Database info entry is null."); + } + + if (databaseInfo.Schema != "Build5Nines.SharpVector") + { + throw new InvalidOperationException("The database schema does not match the expected schema."); + } + + if (databaseInfo.Version != "1.0.0") + { + throw new InvalidOperationException($"The database version does not match the expected version (Expected: 1.0.0 - Actual: {databaseInfo.Version})."); + } - if (databaseType != this.GetType().FullName) + if (databaseInfo.VectorDatabaseClassType != this.GetType().FullName) { - throw new InvalidOperationException($"The database type does not match the expected type [Expected: {databaseType}] "); + throw new InvalidOperationException($"The database type does not match the expected type [Expected: {databaseInfo.VectorDatabaseClassType}] "); } } } diff --git a/src/ConsoleTest/Program.cs b/src/ConsoleTest/Program.cs index 424cec7..4059b5f 100644 --- a/src/ConsoleTest/Program.cs +++ b/src/ConsoleTest/Program.cs @@ -45,11 +45,11 @@ public static async Task Main(string[] args) var jsonString = await File.ReadAllTextAsync("movies.json"); - var importTimer = new Stopwatch(); - importTimer.Start(); + var timer = new Stopwatch(); + timer.Start(); - for (var i = 0; i < 10; i++){ + //for (var i = 0; i < 10; i++){ using (JsonDocument document = JsonDocument.Parse(jsonString)) { JsonElement root = document.RootElement; @@ -65,29 +65,30 @@ await Parallel.ForEachAsync(movies.EnumerateArray(), async (movie, cancellationT await vdb.AddTextAsync(text, metadata); } }); - - // foreach (JsonElement movie in movies.EnumerateArray()) - // { - // var text = movie.GetProperty("description").GetString(); - // var metadata = movie.GetProperty("title").GetString(); - - // if 
(!string.IsNullOrWhiteSpace(text) && !string.IsNullOrWhiteSpace(metadata)) - // { - // await vdb.AddTextAsync(text, metadata); - // } - // } - } } + //} + + timer.Stop(); + Console.WriteLine($"Movie data imported into Vector Database (Elapsed: {timer.ElapsedMilliseconds} ms)"); - importTimer.Stop(); - Console.WriteLine("Movie data imported into Vector Database."); - Console.WriteLine($"Import took {importTimer.ElapsedMilliseconds} ms"); + Console.WriteLine("Saving Vector Database to file..."); + timer.Restart(); + await vdb.SaveToFileAsync("movies.b59vdb"); + timer.Stop(); + Console.WriteLine($"Vector Database saved to file (Elapsed: {timer.ElapsedMilliseconds} ms)"); + Console.WriteLine("Loading Vector Database from file..."); + timer.Restart(); + + await vdb.LoadFromFileAsync("movies.b59vdb"); + + timer.Stop(); + Console.WriteLine($"Vector Database loaded from file (Elapsed: {timer.ElapsedMilliseconds} ms)"); // Paths to the large text files @@ -158,9 +159,7 @@ await Parallel.ForEachAsync(movies.EnumerateArray(), async (movie, cancellationT if (newPrompt != null) { IVectorTextResult result; - var timer = new Stopwatch(); - timer.Start(); - + timer.Restart(); var pageSize = 3; // result = await vdb.Search(newPrompt, From 2fa0262be7db65e3b34fe3ad56d4fa92d489cccb Mon Sep 17 00:00:00 2001 From: Chris Pietschmann Date: Sat, 22 Feb 2025 13:38:52 -0500 Subject: [PATCH 02/33] add another save/load test --- src/SharpVectorTest/VectorDatabaseTests.cs | 70 ++++++++++++++++++++++ 1 file changed, 70 insertions(+) diff --git a/src/SharpVectorTest/VectorDatabaseTests.cs b/src/SharpVectorTest/VectorDatabaseTests.cs index 176fd16..fbf8dea 100644 --- a/src/SharpVectorTest/VectorDatabaseTests.cs +++ b/src/SharpVectorTest/VectorDatabaseTests.cs @@ -1,5 +1,6 @@ namespace SharpVectorTest; +using System.Diagnostics; using Build5Nines.SharpVector; using Build5Nines.SharpVector.Id; using Build5Nines.SharpVector.Preprocessing; @@ -485,6 +486,75 @@ public async Task SaveLoadFile_001() 
Assert.AreEqual("NewNewNew", thirdResult.Texts.First().Text); Assert.AreEqual(4.5, thirdResult.Texts.First().Metadata); } + + [TestMethod] + public async Task SaveLoadFile_002() + { + var databaseOne = new MemoryVectorDatabase(); + + // // Load Vector Database with some sample text + var textTasks = new List(); + for (int i = 0; i < 100; i++) + { + textTasks.Add(databaseOne.AddTextAsync("The Lion King is a 1994 Disney animated film about a young lion cub named Simba who is the heir to the throne of an African savanna.", 5.0)); + textTasks.Add(databaseOne.AddTextAsync("Aladdin is a 2019 live-action Disney adaptation of the 1992 animated classic of the same name about a street urchin who finds a magic lamp and uses a genie's wishes to become a prince so he can marry Princess Jasmine.", 5.0)); + textTasks.Add(databaseOne.AddTextAsync("The Little Mermaid is a 2023 live-action adaptation of Disney's 1989 animated film of the same name. The movie is about Ariel, the youngest of King Triton's daughters, who is fascinated by the human world and falls in love with Prince Eric.", 5.0)); + textTasks.Add(databaseOne.AddTextAsync("Frozen is a 2013 Disney movie about a fearless optimist named Anna who sets off on a journey to find her sister Elsa, whose icy powers have trapped their kingdom in eternal winter.", 5.0)); + textTasks.Add(databaseOne.AddTextAsync("Tangled is a 2010 Disney animated comedy adventure film based on the story of Rapunzel. The movie is about a long-lost princess with magical blonde hair who has been locked in a tower her entire life by Gothel, who wants to use Rapunzel's powers for herself.", 5.0)); + textTasks.Add(databaseOne.AddTextAsync("Wreck-It Ralph is a 2012 Disney animated film about Ralph, a character who plays the bad guy in the arcade game Fix-It Felix Jr. for 30 years. Ralph is a muscular, 9-foot-tall character with spiky auburn hair, a pink nose, and large hands and feet. 
He wears burgundy overalls with a broken strap, a plaid shirt with ripped sleeves, and a teal undershirt.", 5.0)); + textTasks.Add(databaseOne.AddTextAsync("Iron Man (2008) is a Marvel Studios action, adventure, and sci-fi movie about Tony Stark (Robert Downey Jr.), a billionaire inventor and weapons developer who is kidnapped by terrorists and forced to build a weapon. Instead, Tony uses his ingenuity to build a high-tech suit of armor and escape, becoming the superhero Iron Man. He then returns to the United States to refine the suit and use it to fight crime and terrorism.", 5.0)); + textTasks.Add(databaseOne.AddTextAsync("Black Panther is a 2018 Marvel Studios movie about T'Challa, the heir to the isolated African nation of Wakanda, who returns home to take the throne after his father's death. However, T'Challa faces challenges from within his own country, including Killmonger, who wants to abandon Wakanda's isolationist policies and start a global revolution. T'Challa must team up with C.I.A. agent Everett K. Ross and the Dora Milaje, Wakanda's special forces, to prevent Wakanda from being drawn into a world war.", 5.0)); + textTasks.Add(databaseOne.AddTextAsync("Black Panther: Wakanda Forever is a 2022 Marvel movie about the Wakandans fighting to protect their country from world powers after the death of King T'Challa. The movie is a sequel to the popular Black Panther and stars Chadwick Boseman as T'Challa, Letitia Wright as Shuri, Angela Bassett as Ramonda, and Tenoch Huerta Mejía as Namor.", 5.0)); + textTasks.Add(databaseOne.AddTextAsync("The Incredible Hulk is a 2008 Marvel movie about scientist Bruce Banner (Edward Norton) who turns into a giant green monster called the Hulk when he's angry or frightened. 
After a gamma radiation accident, Banner is on the run from the military while searching for a cure for his condition.", 5.0)); + textTasks.Add(databaseOne.AddTextAsync("Hackers is a 1995 American crime thriller film about a group of high school hackers who discover a criminal plot to use a computer virus to destroy five oil tankers. The film stars Jonny Lee Miller, Angelina Jolie, Jesse Bradford, Matthew Lillard, Laurence Mason, Renoly Santiago, Lorraine Bracco, and Fisher Stevens. Iain Softley directed the film, which was made during the mid-1990s when the internet was becoming popular.", 5.0)); + textTasks.Add(databaseOne.AddTextAsync("WarGames is a 1983 American techno-thriller film about a high school computer hacker who accidentally accesses a top secret military supercomputer that controls the U.S. nuclear arsenal. The hacker, David Lightman (Matthew Broderick), starts a game of Global Thermonuclear War, triggering a false alarm that threatens to start World War III. David must convince the computer that he only wanted to play a game and not the real thing, with help from his girlfriend (Ally Sheedy) and a government official (Dabney Coleman)", 5.0)); + textTasks.Add(databaseOne.AddTextAsync("Cars is a 2006 Pixar movie about a rookie race car named Lightning McQueen who gets stranded in a small town while on his way to an important race. McQueen accidentally damages the road in Radiator Springs, a forgotten town on Route 66, and is forced to repair it. While there, he meets Sally, Mater, Doc Hudson, and other characters who help him learn that there's more to life than fame and trophies. McQueen finds friendship and love in the town, and begins to reevaluate his priorities. 
The movie teaches McQueen the importance of caring for others, integrity, and that winning isn't everything.", 5.0)); + textTasks.Add(databaseOne.AddTextAsync("The Incredibles is a 2004 Pixar animated action-adventure film about a family of superheroes who are forced to live a normal suburban life while hiding their powers. The movie is set in a retro-futuristic 1960s and has a runtime of 1 hour and 55 minutes.", 5.0)); + textTasks.Add(databaseOne.AddTextAsync("Toy Story is a 1995 animated comedy film about the relationship between Woody, a cowboy doll, and Buzz Lightyear, an action figure. The film takes place in a world where toys come to life when humans are not present. Woody is the leader of the toys in Andy's room, including a Tyrannosaurus Rex and Mr. Potato Head. When Buzz becomes Andy's favorite toy, Woody becomes jealous and plots against him. When Andy's family moves, Woody and Buzz must escape the clutches of their neighbor, Sid Phillips, and reunite with Andy.", 5.0)); + textTasks.Add(databaseOne.AddTextAsync("In Toy Story 2, Andy's toys are left to their own devices while he goes to Cowboy Camp, and Woody is kidnapped by a toy collector named Al McWhiggin. 
Buzz Lightyear and the other toys set out on a rescue mission to save Woody before he becomes a museum toy.", 5.0)); + textTasks.Add(databaseOne.AddTextAsync("Iron Man 2 is a 2010 action-adventure fantasy film about Tony Stark (Robert Downey Jr.), a billionaire inventor and superhero who must deal with declining health, government pressure, and a vengeful enemy.", 5.0)); + } + + await Task.WhenAll(textTasks.ToArray()); + + var firstResult = await databaseOne.SearchAsync("Lion King", pageCount: 5); + Assert.AreEqual(5, firstResult.Texts.Count()); + Assert.IsTrue(firstResult.Texts.First().Text.Contains("Lion King")); + Assert.AreEqual(5.0, firstResult.Texts.First().Metadata); + Assert.AreEqual(0.3396831452846527, firstResult.Texts.First().VectorComparison); + + var fileName = "vector_database.b59vdb"; + var timer = new Stopwatch(); + + timer.Start(); + await databaseOne.SaveToFileAsync(fileName); + timer.Stop(); + Console.WriteLine($"SaveLoadFile_002: Save File: {timer.ElapsedMilliseconds} ms"); + + Assert.IsTrue(timer.ElapsedMilliseconds < 200, $"SaveLoadFile_002: Save File took too long - Expected: < 200 - Actual: {timer.ElapsedMilliseconds} ms"); + + var databaseTwo = new MemoryVectorDatabase(); + timer.Restart(); + await databaseTwo.LoadFromFileAsync(fileName); + timer.Stop(); + Console.WriteLine($"SaveLoadFile_002: Load File: {timer.ElapsedMilliseconds} ms"); + + Assert.IsTrue(timer.ElapsedMilliseconds < 200, $"SaveLoadFile_002: Load File took too long - Expected: < 200 - Actual: {timer.ElapsedMilliseconds} ms"); + + var secondResult = await databaseTwo.SearchAsync("Lion King", pageCount: 5); + Assert.AreEqual(5, secondResult.Texts.Count()); + Assert.IsTrue(secondResult.Texts.First().Text.Contains("Lion King")); + Assert.AreEqual(5.0, secondResult.Texts.First().Metadata); + Assert.AreEqual(0.3396831452846527, secondResult.Texts.First().VectorComparison); + + // Compare both results + Assert.AreEqual(firstResult.Texts.Count(), secondResult.Texts.Count()); + + 
databaseTwo.AddText("NewNewNew", 4.5); + var thirdResult = await databaseTwo.SearchAsync("NewNewNew", pageCount: 5); + Assert.AreEqual("NewNewNew", thirdResult.Texts.First().Text); + Assert.AreEqual(4.5, thirdResult.Texts.First().Metadata); + } } From b14235140c70aa37f0e891499c35f8017076deee Mon Sep 17 00:00:00 2001 From: Chris Pietschmann Date: Sat, 22 Feb 2025 14:04:23 -0500 Subject: [PATCH 03/33] added BenchmarkDotNet tests for save/load vector database --- .gitignore | 2 + src/SharpVector.sln | 2 + .../MemoryVectorDatabasePerformance.cs | 70 +++++++++++++++++++ src/SharpVectorPerformance/Program.cs | 12 ++++ .../SharpVectorPerformance.csproj | 18 +++++ 5 files changed, 104 insertions(+) create mode 100644 src/SharpVectorPerformance/MemoryVectorDatabasePerformance.cs create mode 100644 src/SharpVectorPerformance/Program.cs create mode 100644 src/SharpVectorPerformance/SharpVectorPerformance.csproj diff --git a/.gitignore b/.gitignore index 326e77b..2cde6db 100644 --- a/.gitignore +++ b/.gitignore @@ -3,3 +3,5 @@ obj bin .DS_Store + +BenchmarkDotNet.Artifacts/ diff --git a/src/SharpVector.sln b/src/SharpVector.sln index afa1c89..9b5c1e5 100644 --- a/src/SharpVector.sln +++ b/src/SharpVector.sln @@ -9,6 +9,8 @@ Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Build5Nines.SharpVector", " EndProject Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "SharpVectorTest", "SharpVectorTest\SharpVectorTest.csproj", "{AFF76051-E043-45EB-9B5F-05D9C45D0DC7}" EndProject +Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "SharpVectorPerformance", "SharpVectorPerformance\SharpVectorPerformance.csproj", "{AFF76051-E043-45EB-9B5F-05D9C45D0DC7}" +EndProject Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Build5Nines.SharpVector.OpenAI", "Build5Nines.SharpVector.OpenAI\Build5Nines.SharpVector.OpenAI.csproj", "{CABF1DBE-8FE1-4EDF-B5DD-B1BFB88D93C3}" EndProject Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "SharpVectorOpenAITest", 
"SharpVectorOpenAITest\SharpVectorOpenAITest.csproj", "{04E08FA2-C4B4-47B4-ABB0-6FD57EA5FFFB}" diff --git a/src/SharpVectorPerformance/MemoryVectorDatabasePerformance.cs b/src/SharpVectorPerformance/MemoryVectorDatabasePerformance.cs new file mode 100644 index 0000000..72524ab --- /dev/null +++ b/src/SharpVectorPerformance/MemoryVectorDatabasePerformance.cs @@ -0,0 +1,70 @@ +namespace SharpVectorPerformance; + +using System.Diagnostics; +using Build5Nines.SharpVector; +using Build5Nines.SharpVector.Id; +using Build5Nines.SharpVector.Preprocessing; +using Build5Nines.SharpVector.VectorCompare; +using Build5Nines.SharpVector.Vectorization; +using Build5Nines.SharpVector.VectorStore; +using Build5Nines.SharpVector.Vocabulary; +using BenchmarkDotNet.Attributes; +using BenchmarkDotNet.Running; + +[MemoryDiagnoser] +public class MemoryVectorDatabasePerformance +{ + private MemoryVectorDatabase database; + private string fileName = "memory_vector_database_test.b59vdb"; + + [GlobalSetup] + public async Task Setup() + { + database = new MemoryVectorDatabase(); + // // Load Vector Database with some sample text + var textTasks = new List(); + for (int i = 0; i < 100; i++) + { + textTasks.Add(database.AddTextAsync("The Lion King is a 1994 Disney animated film about a young lion cub named Simba who is the heir to the throne of an African savanna.", 5.0)); + textTasks.Add(database.AddTextAsync("Aladdin is a 2019 live-action Disney adaptation of the 1992 animated classic of the same name about a street urchin who finds a magic lamp and uses a genie's wishes to become a prince so he can marry Princess Jasmine.", 5.0)); + textTasks.Add(database.AddTextAsync("The Little Mermaid is a 2023 live-action adaptation of Disney's 1989 animated film of the same name. 
The movie is about Ariel, the youngest of King Triton's daughters, who is fascinated by the human world and falls in love with Prince Eric.", 5.0)); + textTasks.Add(database.AddTextAsync("Frozen is a 2013 Disney movie about a fearless optimist named Anna who sets off on a journey to find her sister Elsa, whose icy powers have trapped their kingdom in eternal winter.", 5.0)); + textTasks.Add(database.AddTextAsync("Tangled is a 2010 Disney animated comedy adventure film based on the story of Rapunzel. The movie is about a long-lost princess with magical blonde hair who has been locked in a tower her entire life by Gothel, who wants to use Rapunzel's powers for herself.", 5.0)); + textTasks.Add(database.AddTextAsync("Wreck-It Ralph is a 2012 Disney animated film about Ralph, a character who plays the bad guy in the arcade game Fix-It Felix Jr. for 30 years. Ralph is a muscular, 9-foot-tall character with spiky auburn hair, a pink nose, and large hands and feet. He wears burgundy overalls with a broken strap, a plaid shirt with ripped sleeves, and a teal undershirt.", 5.0)); + textTasks.Add(database.AddTextAsync("Iron Man (2008) is a Marvel Studios action, adventure, and sci-fi movie about Tony Stark (Robert Downey Jr.), a billionaire inventor and weapons developer who is kidnapped by terrorists and forced to build a weapon. Instead, Tony uses his ingenuity to build a high-tech suit of armor and escape, becoming the superhero Iron Man. He then returns to the United States to refine the suit and use it to fight crime and terrorism.", 5.0)); + textTasks.Add(database.AddTextAsync("Black Panther is a 2018 Marvel Studios movie about T'Challa, the heir to the isolated African nation of Wakanda, who returns home to take the throne after his father's death. However, T'Challa faces challenges from within his own country, including Killmonger, who wants to abandon Wakanda's isolationist policies and start a global revolution. T'Challa must team up with C.I.A. agent Everett K. 
Ross and the Dora Milaje, Wakanda's special forces, to prevent Wakanda from being drawn into a world war.", 5.0)); + textTasks.Add(database.AddTextAsync("Black Panther: Wakanda Forever is a 2022 Marvel movie about the Wakandans fighting to protect their country from world powers after the death of King T'Challa. The movie is a sequel to the popular Black Panther and stars Chadwick Boseman as T'Challa, Letitia Wright as Shuri, Angela Bassett as Ramonda, and Tenoch Huerta Mejía as Namor.", 5.0)); + textTasks.Add(database.AddTextAsync("The Incredible Hulk is a 2008 Marvel movie about scientist Bruce Banner (Edward Norton) who turns into a giant green monster called the Hulk when he's angry or frightened. After a gamma radiation accident, Banner is on the run from the military while searching for a cure for his condition.", 5.0)); + textTasks.Add(database.AddTextAsync("Hackers is a 1995 American crime thriller film about a group of high school hackers who discover a criminal plot to use a computer virus to destroy five oil tankers. The film stars Jonny Lee Miller, Angelina Jolie, Jesse Bradford, Matthew Lillard, Laurence Mason, Renoly Santiago, Lorraine Bracco, and Fisher Stevens. Iain Softley directed the film, which was made during the mid-1990s when the internet was becoming popular.", 5.0)); + textTasks.Add(database.AddTextAsync("WarGames is a 1983 American techno-thriller film about a high school computer hacker who accidentally accesses a top secret military supercomputer that controls the U.S. nuclear arsenal. The hacker, David Lightman (Matthew Broderick), starts a game of Global Thermonuclear War, triggering a false alarm that threatens to start World War III. 
David must convince the computer that he only wanted to play a game and not the real thing, with help from his girlfriend (Ally Sheedy) and a government official (Dabney Coleman)", 5.0)); + textTasks.Add(database.AddTextAsync("Cars is a 2006 Pixar movie about a rookie race car named Lightning McQueen who gets stranded in a small town while on his way to an important race. McQueen accidentally damages the road in Radiator Springs, a forgotten town on Route 66, and is forced to repair it. While there, he meets Sally, Mater, Doc Hudson, and other characters who help him learn that there's more to life than fame and trophies. McQueen finds friendship and love in the town, and begins to reevaluate his priorities. The movie teaches McQueen the importance of caring for others, integrity, and that winning isn't everything.", 5.0)); + textTasks.Add(database.AddTextAsync("The Incredibles is a 2004 Pixar animated action-adventure film about a family of superheroes who are forced to live a normal suburban life while hiding their powers. The movie is set in a retro-futuristic 1960s and has a runtime of 1 hour and 55 minutes.", 5.0)); + textTasks.Add(database.AddTextAsync("Toy Story is a 1995 animated comedy film about the relationship between Woody, a cowboy doll, and Buzz Lightyear, an action figure. The film takes place in a world where toys come to life when humans are not present. Woody is the leader of the toys in Andy's room, including a Tyrannosaurus Rex and Mr. Potato Head. When Buzz becomes Andy's favorite toy, Woody becomes jealous and plots against him. When Andy's family moves, Woody and Buzz must escape the clutches of their neighbor, Sid Phillips, and reunite with Andy.", 5.0)); + textTasks.Add(database.AddTextAsync("In Toy Story 2, Andy's toys are left to their own devices while he goes to Cowboy Camp, and Woody is kidnapped by a toy collector named Al McWhiggin. 
Buzz Lightyear and the other toys set out on a rescue mission to save Woody before he becomes a museum toy.", 5.0)); + textTasks.Add(database.AddTextAsync("Iron Man 2 is a 2010 action-adventure fantasy film about Tony Stark (Robert Downey Jr.), a billionaire inventor and superhero who must deal with declining health, government pressure, and a vengeful enemy.", 5.0)); + } + // 1700 text documents + + await Task.WhenAll(textTasks.ToArray()); + } + + [Benchmark] + public async Task SaveLoadPerformanceTest_BasicMemoryVectorDatabase_001() + { + await database.SaveToFileAsync(fileName); + + await database.LoadFromFileAsync(fileName); + } + + [Benchmark] + public async Task SavePerformanceTest_BasicMemoryVectorDatabase_001() + { + await database.SaveToFileAsync(fileName); + } + + [Benchmark] + public async Task LoadPerformanceTest_BasicMemoryVectorDatabase_001() + { + await database.LoadFromFileAsync(fileName); + } +} \ No newline at end of file diff --git a/src/SharpVectorPerformance/Program.cs b/src/SharpVectorPerformance/Program.cs new file mode 100644 index 0000000..375e8d0 --- /dev/null +++ b/src/SharpVectorPerformance/Program.cs @@ -0,0 +1,12 @@ +// See https://aka.ms/new-console-template for more information +using BenchmarkDotNet.Running; + +namespace SharpVectorPerformance; + +public class Program +{ + public static void Main(string[] args) + { + BenchmarkRunner.Run(); + } +} \ No newline at end of file diff --git a/src/SharpVectorPerformance/SharpVectorPerformance.csproj b/src/SharpVectorPerformance/SharpVectorPerformance.csproj new file mode 100644 index 0000000..05ef0af --- /dev/null +++ b/src/SharpVectorPerformance/SharpVectorPerformance.csproj @@ -0,0 +1,18 @@ + + + + Exe + net8.0 + enable + enable + + + + + + + + + + + From c923da7b8721b6eae6f82b764cd54f22f529ece5 Mon Sep 17 00:00:00 2001 From: Chris Pietschmann Date: Sat, 22 Feb 2025 15:37:45 -0500 Subject: [PATCH 04/33] Update build-release.yml --- .github/workflows/build-release.yml | 11 ++++++++++- 1 
file changed, 10 insertions(+), 1 deletion(-) diff --git a/.github/workflows/build-release.yml b/.github/workflows/build-release.yml index 1e87782..ee8b8ba 100644 --- a/.github/workflows/build-release.yml +++ b/.github/workflows/build-release.yml @@ -30,6 +30,9 @@ jobs: - name: Build run: dotnet build --configuration Release --no-restore + + - name: Performance + run: dotnet run --project src/SharpVectorPerformance --configuration Release # - name: Publish # run: dotnet publish --configuration Release --output ./publish --no-build @@ -40,7 +43,13 @@ jobs: # name: release-build # path: ./publish - - name: Upload artifact + - name: Upload Performance artifact + uses: actions/upload-artifact@v4 + with: + name: performance-results + path: 'src/SharpVectorPerformance/BenchmarkDotNet.Artifacts/results/SharpVectorPerformance.MemoryVectorDatabasePerformance-report-github.md' + + - name: Upload Nuget artifact uses: actions/upload-artifact@v4 with: name: nuget-package From 767b220e2e8c8635b4c8277cd0e0d8ed80a6bfae Mon Sep 17 00:00:00 2001 From: Chris Pietschmann Date: Sat, 22 Feb 2025 15:38:49 -0500 Subject: [PATCH 05/33] Update build-release.yml --- .github/workflows/build-release.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build-release.yml b/.github/workflows/build-release.yml index ee8b8ba..402afaf 100644 --- a/.github/workflows/build-release.yml +++ b/.github/workflows/build-release.yml @@ -32,7 +32,7 @@ jobs: run: dotnet build --configuration Release --no-restore - name: Performance - run: dotnet run --project src/SharpVectorPerformance --configuration Release + run: dotnet run --project src/SharpVectorPerformance/SharpVectorPerformance.csproj --configuration Release # - name: Publish # run: dotnet publish --configuration Release --output ./publish --no-build From f41cc5b02a495c74c2b57cfeef2806799722b6a6 Mon Sep 17 00:00:00 2001 From: Chris Pietschmann Date: Sat, 22 Feb 2025 15:39:27 -0500 Subject: [PATCH 06/33] Update 
build-release.yml --- .github/workflows/build-release.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build-release.yml b/.github/workflows/build-release.yml index 402afaf..51dba66 100644 --- a/.github/workflows/build-release.yml +++ b/.github/workflows/build-release.yml @@ -32,7 +32,7 @@ jobs: run: dotnet build --configuration Release --no-restore - name: Performance - run: dotnet run --project src/SharpVectorPerformance/SharpVectorPerformance.csproj --configuration Release + run: dotnet run --project SharpVectorPerformance --configuration Release # - name: Publish # run: dotnet publish --configuration Release --output ./publish --no-build From 94dd8dae75dea3516a3e7e2e1aa78196bffa74fd Mon Sep 17 00:00:00 2001 From: Chris Pietschmann Date: Sat, 22 Feb 2025 15:41:55 -0500 Subject: [PATCH 07/33] Update build-release.yml --- .github/workflows/build-release.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build-release.yml b/.github/workflows/build-release.yml index 51dba66..3a24cec 100644 --- a/.github/workflows/build-release.yml +++ b/.github/workflows/build-release.yml @@ -47,7 +47,7 @@ jobs: uses: actions/upload-artifact@v4 with: name: performance-results - path: 'src/SharpVectorPerformance/BenchmarkDotNet.Artifacts/results/SharpVectorPerformance.MemoryVectorDatabasePerformance-report-github.md' + path: 'SharpVectorPerformance/BenchmarkDotNet.Artifacts/results/SharpVectorPerformance.MemoryVectorDatabasePerformance-report-github.md' - name: Upload Nuget artifact uses: actions/upload-artifact@v4 From d28d347cbab219d8d0ab5e299441acad5441d23d Mon Sep 17 00:00:00 2001 From: Chris Pietschmann Date: Sat, 22 Feb 2025 15:46:39 -0500 Subject: [PATCH 08/33] Update build-release.yml --- .github/workflows/build-release.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build-release.yml b/.github/workflows/build-release.yml index 3a24cec..34d5960 100644 --- 
a/.github/workflows/build-release.yml +++ b/.github/workflows/build-release.yml @@ -47,7 +47,7 @@ jobs: uses: actions/upload-artifact@v4 with: name: performance-results - path: 'SharpVectorPerformance/BenchmarkDotNet.Artifacts/results/SharpVectorPerformance.MemoryVectorDatabasePerformance-report-github.md' + path: 'BenchmarkDotNet.Artifacts/results/SharpVectorPerformance.MemoryVectorDatabasePerformance-report-github.md' - name: Upload Nuget artifact uses: actions/upload-artifact@v4 From b68a6d9bcb0f1359c02bc8be00c33b632361c1fc Mon Sep 17 00:00:00 2001 From: Chris Pietschmann Date: Sat, 22 Feb 2025 15:50:31 -0500 Subject: [PATCH 09/33] Update build-release.yml --- .github/workflows/build-release.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/build-release.yml b/.github/workflows/build-release.yml index 34d5960..1bd9bdb 100644 --- a/.github/workflows/build-release.yml +++ b/.github/workflows/build-release.yml @@ -43,6 +43,9 @@ jobs: # name: release-build # path: ./publish + - name: List Files + run: ls -R + - name: Upload Performance artifact uses: actions/upload-artifact@v4 with: From 139330f8e49535b5d2c4054c1afc1b372b59f56c Mon Sep 17 00:00:00 2001 From: Chris Pietschmann Date: Sat, 22 Feb 2025 15:55:26 -0500 Subject: [PATCH 10/33] Update build-release.yml --- .github/workflows/build-release.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build-release.yml b/.github/workflows/build-release.yml index 1bd9bdb..c36912e 100644 --- a/.github/workflows/build-release.yml +++ b/.github/workflows/build-release.yml @@ -50,7 +50,7 @@ jobs: uses: actions/upload-artifact@v4 with: name: performance-results - path: 'BenchmarkDotNet.Artifacts/results/SharpVectorPerformance.MemoryVectorDatabasePerformance-report-github.md' + path: './BenchmarkDotNet.Artifacts/results/SharpVectorPerformance.MemoryVectorDatabasePerformance-report-github.md' - name: Upload Nuget artifact uses: actions/upload-artifact@v4 From 
4d3d2c552078cb0dcca11c2af55c7a7538e958de Mon Sep 17 00:00:00 2001 From: Chris Pietschmann Date: Sat, 22 Feb 2025 15:58:49 -0500 Subject: [PATCH 11/33] Update build-release.yml --- .github/workflows/build-release.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/build-release.yml b/.github/workflows/build-release.yml index c36912e..55c32ed 100644 --- a/.github/workflows/build-release.yml +++ b/.github/workflows/build-release.yml @@ -43,14 +43,14 @@ jobs: # name: release-build # path: ./publish - - name: List Files - run: ls -R + # - name: List Files + # run: ls -R - name: Upload Performance artifact uses: actions/upload-artifact@v4 with: name: performance-results - path: './BenchmarkDotNet.Artifacts/results/SharpVectorPerformance.MemoryVectorDatabasePerformance-report-github.md' + path: './src/BenchmarkDotNet.Artifacts/results/SharpVectorPerformance.MemoryVectorDatabasePerformance-report-github.md' - name: Upload Nuget artifact uses: actions/upload-artifact@v4 From 8cf70e1dde5eea3d5c5591dd9c53615fc7960960 Mon Sep 17 00:00:00 2001 From: Chris Pietschmann Date: Sat, 22 Feb 2025 15:59:44 -0500 Subject: [PATCH 12/33] Update build-release.yml --- .github/workflows/build-release.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.github/workflows/build-release.yml b/.github/workflows/build-release.yml index 55c32ed..82cc70c 100644 --- a/.github/workflows/build-release.yml +++ b/.github/workflows/build-release.yml @@ -2,8 +2,6 @@ name: Build and Release on: push: - branches: - - main pull_request: branches: - main From 82239286c9798fefd2e6b181b5b3a670488e9f48 Mon Sep 17 00:00:00 2001 From: Chris Pietschmann Date: Sat, 22 Feb 2025 16:05:38 -0500 Subject: [PATCH 13/33] output perf results to GitHub Action step summary --- .github/workflows/build-release.yml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/.github/workflows/build-release.yml b/.github/workflows/build-release.yml index 82cc70c..6fc00b6 
100644 --- a/.github/workflows/build-release.yml +++ b/.github/workflows/build-release.yml @@ -5,6 +5,7 @@ on: pull_request: branches: - main + - dev workflow_dispatch: jobs: @@ -44,11 +45,14 @@ jobs: # - name: List Files # run: ls -R + - name: Performance Results + run: echo ./src/BenchmarkDotNet.Artifacts/results/SharpVectorPerformance.MemoryVectorDatabasePerformance-report-github.md >> $GITHUB_STEP_SUMMARY + - name: Upload Performance artifact uses: actions/upload-artifact@v4 with: name: performance-results - path: './src/BenchmarkDotNet.Artifacts/results/SharpVectorPerformance.MemoryVectorDatabasePerformance-report-github.md' + path: './src/BenchmarkDotNet.Artifacts/*' - name: Upload Nuget artifact uses: actions/upload-artifact@v4 From d2cd4c683f692b0b36f126d262f9411b46ea1c9b Mon Sep 17 00:00:00 2001 From: Chris Pietschmann Date: Sat, 22 Feb 2025 16:06:03 -0500 Subject: [PATCH 14/33] add CHANGELOG.md --- CHANGELOG.md | 65 ++++++++++++++++++++++++++++++++++++++++++++++++++++ README.md | 48 -------------------------------------- 2 files changed, 65 insertions(+), 48 deletions(-) create mode 100644 CHANGELOG.md diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..8a5be6d --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,65 @@ +# Changelog + +All notable changes to this project will be documented in this file. + +The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), +and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). + +## v2.0.0 (In Progress) + +Added: + +- Add data persistence capability + +Breaking Change: + +- Refactor `IVocabularyStore` to be used within `MemoryDictionaryVectorStoreWithVocabulary`. This simplifies implementation of `MemoryVectorDatabaseBase`, and helps to enable data persistence capability. + +Notes: + +- The breaking change only applies if the base classes are being used. 
If the `BasicMemoryVectorDatabase` is being used, this will likely not break applications that depend on this library. However, in some instances where explicitly depending on `VectorTextResult` and its properties (without using `var` in consuming code) there might be minor code changes needed when migrating from previous versions of the library. + +## v1.0.1 (2025-02-06) + +- Upgrade to .NET 8 or higher + +### v1.0.0 (2024-05-24) + +Added: + +- Simplify object model by combining Async and non-Async classes, `BasicMemoryVectorDatabase` now supports both synchronous and asynchronous operations. +- Refactored to remove unnecessary classes where the `Async` versions will work just fine. +- Improve async/await and multi-threading use + +### v0.9.8-beta (2024-05-20) + +Added: + +- Added `Async` version of classes to support multi-threading +- Metadata is no longer required when calling `.AddText()` and `.AddTextAsync()` +- Refactor `IVectorSimilarityCalculator` to `IVectorComparer` and `CosineVectorSimilarityCalculatorAsync` to `CosineSimilarityVectorComparerAsync` +- Add new `EuclideanDistanceVectorComparerAsync` +- Fix `MemoryVectorDatabase` to no longer require the unused `TId` generic type +- Rename `VectorSimilarity` and `Similarity` properties to `VectorComparison` + +### v0.9.5-beta (2024-05-18) + +Added: + +- Add `TextDataLoader` class to provide support for different methods of text chunking when loading documents into the vector database. + +### v0.9.0-beta (2024-05-18) + +Added: + +- Introduced the `BasicMemoryVectorDatabase` class as the basic Vector Database implementation that uses a Bag of Words vectorization strategy, with Cosine similarity, a dictionary vocabulary store, and a basic text preprocessor. +- Add more C# Generics use, so the library is more customizable when used, and custom vector databases can be implemented if desired. +- Added `VectorTextResultItem.Similarity` so consuming code can inspect similarity of the Text in the vector search results.
+- Update `.Search` method to support search result paging and threshold support for similarity comparison +- Add some basic Unit Tests + +### v0.8.0-beta (2024-05-17) + +Added: + +- Initial release - let's do this! diff --git a/README.md b/README.md index 572da8e..5b3e71c 100644 --- a/README.md +++ b/README.md @@ -138,54 +138,6 @@ Here's a screenshot of the test console app running: ![](assets/build5nines-sharpvector-console-screenshot.jpg) -## Change Log - -## v2.0.0 (In Progress) - -Feature: -- Add data persistence capability - -Breaking Change: -- Refactor `IVocabularyStore` to be used within `MemoryDictionaryVectorStoreWithVocabulary`. This simplifies implementation of `MemoryVectorDatabaseBase`, and helps to enable data persistence capability. - -Notes: -- The breaking change only applies if the base classes are being used. If the `BasicMemoryVectorDatabase` is being used, this will likely not break applications that depend on this library. However, in some instances where explicitly depending on `VectorTextResult` it's properties (without using `var` in consuming code) there might be minor code changes needed when migrating from previous versions of the library. - -## v1.0.1 (2025-02-06) - -- Upgrade to .NET 8 or higher - -### v1.0.0 (2024-05-24) - -- Simplify object model by combining Async and non-Async classes, `BasicMemoryVectorDatabase` now support both synchronous and asynchronous operations. -- Refactored to remove unnecessary classes where the `Async` versions will work just fine. 
-- Improve async/await and multi-threading use - -### v0.9.8-beta (2024-05-20) - -- Added `Async` version of classes to support multi-threading -- Metadata is no longer required when calling `.AddText()` and `.AddTextAsync()` -- Refactor `IVectorSimilarityCalculator` to `IVectorComparer` and `CosineVectorSimilarityCalculatorAsync` to `CosineSimilarityVectorComparerAsync` -- Add new `EuclideanDistanceVectorComparerAsync` -- Fix `MemoryVectorDatabase` to no longer requird unused `TId` generic type -- Rename `VectorSimilarity` and `Similarity` properties to `VectorComparison` - -### v0.9.5-beta (2024-05-18) - -- Add `TextDataLoader` class to provide support for different methods of text chunking when loading documents into the vector database. - -### v0.9.0-beta (2024-05-18) - -- Introduced the `BasicMemoryVectorDatabase` class as the basic Vector Database implementations that uses a Bag of Words vectorization strategy, with Cosine similarity, a dictionary vocabulary store, and a basic text preprocessor. -- Add more C# Generics use, so the library is more customizable when used, and custom vector databases can be implemented if desired. -- Added `VectorTextResultItem.Similarity` so consuming code can inspect similarity of the Text in the vector search results. -- Update `.Search` method to support search result paging and threshold support for similarity comparison -- Add some basic Unit Tests - -### v0.8.0-beta (2024-05-17) - -- Initial release - let's do this! - ## Maintained By The **Build5Nines SharpVector** project is maintained by [Chris Pietschmann](https://pietschsoft.com?utm_source=github&utm_medium=sharpvector), founder of [Build5Nines](https://build5nines.com?utm_source=github&utm_medium=sharpvector), Microsoft MVP, HashiCorp Ambassador, and Microsoft Certified Trainer (MCT). 
From 513e67c7502530baac3b1c612aaa6ee183de8d5b Mon Sep 17 00:00:00 2001 From: Chris Pietschmann Date: Sat, 22 Feb 2025 16:06:49 -0500 Subject: [PATCH 15/33] Update build-release.yml --- .github/workflows/build-release.yml | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/.github/workflows/build-release.yml b/.github/workflows/build-release.yml index 6fc00b6..cd9153f 100644 --- a/.github/workflows/build-release.yml +++ b/.github/workflows/build-release.yml @@ -30,7 +30,7 @@ jobs: - name: Build run: dotnet build --configuration Release --no-restore - - name: Performance + - name: Performance Test run: dotnet run --project SharpVectorPerformance --configuration Release # - name: Publish @@ -42,9 +42,6 @@ jobs: # name: release-build # path: ./publish - # - name: List Files - # run: ls -R - - name: Performance Results run: echo ./src/BenchmarkDotNet.Artifacts/results/SharpVectorPerformance.MemoryVectorDatabasePerformance-report-github.md >> $GITHUB_STEP_SUMMARY From d28682f7af306377e5a3104c4cc004230bcd806d Mon Sep 17 00:00:00 2001 From: Chris Pietschmann Date: Sat, 22 Feb 2025 16:07:31 -0500 Subject: [PATCH 16/33] Update build-release.yml --- .github/workflows/build-release.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build-release.yml b/.github/workflows/build-release.yml index cd9153f..ce4d45f 100644 --- a/.github/workflows/build-release.yml +++ b/.github/workflows/build-release.yml @@ -43,7 +43,7 @@ jobs: # path: ./publish - name: Performance Results - run: echo ./src/BenchmarkDotNet.Artifacts/results/SharpVectorPerformance.MemoryVectorDatabasePerformance-report-github.md >> $GITHUB_STEP_SUMMARY + run: cat ./src/BenchmarkDotNet.Artifacts/results/SharpVectorPerformance.MemoryVectorDatabasePerformance-report-github.md >> $GITHUB_STEP_SUMMARY - name: Upload Performance artifact uses: actions/upload-artifact@v4 From 4b57bd69d7fad86c417db7beea05102c46b66cce Mon Sep 17 00:00:00 2001 From: Chris Pietschmann Date: 
Sat, 22 Feb 2025 16:07:45 -0500 Subject: [PATCH 17/33] Update build-release.yml --- .github/workflows/build-release.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/build-release.yml b/.github/workflows/build-release.yml index ce4d45f..62dc452 100644 --- a/.github/workflows/build-release.yml +++ b/.github/workflows/build-release.yml @@ -2,6 +2,9 @@ name: Build and Release on: push: + branches: + - main + - dev pull_request: branches: - main From 8eac458b514c32189ba49a5eb6b92b720c6dbe61 Mon Sep 17 00:00:00 2001 From: Chris Pietschmann Date: Sat, 22 Feb 2025 16:09:32 -0500 Subject: [PATCH 18/33] Update build-release.yml --- .github/workflows/build-release.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build-release.yml b/.github/workflows/build-release.yml index 62dc452..209eb12 100644 --- a/.github/workflows/build-release.yml +++ b/.github/workflows/build-release.yml @@ -46,7 +46,7 @@ jobs: # path: ./publish - name: Performance Results - run: cat ./src/BenchmarkDotNet.Artifacts/results/SharpVectorPerformance.MemoryVectorDatabasePerformance-report-github.md >> $GITHUB_STEP_SUMMARY + run: cat ./BenchmarkDotNet.Artifacts/results/SharpVectorPerformance.MemoryVectorDatabasePerformance-report-github.md >> $GITHUB_STEP_SUMMARY - name: Upload Performance artifact uses: actions/upload-artifact@v4 From e740b9ef32253fdc67a7b553e7c1a4398e438dbb Mon Sep 17 00:00:00 2001 From: Chris Pietschmann Date: Sat, 22 Feb 2025 16:13:35 -0500 Subject: [PATCH 19/33] Update Build5Nines.SharpVector.csproj --- src/Build5Nines.SharpVector/Build5Nines.SharpVector.csproj | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Build5Nines.SharpVector/Build5Nines.SharpVector.csproj b/src/Build5Nines.SharpVector/Build5Nines.SharpVector.csproj index 196bfc4..357d821 100644 --- a/src/Build5Nines.SharpVector/Build5Nines.SharpVector.csproj +++ b/src/Build5Nines.SharpVector/Build5Nines.SharpVector.csproj @@ -16,7 +16,7 @@ 
https://github.com/Build5Nines/SharpVector/blob/main/LICENSE Chris Pietschmann Build5Nines LLC - vector;search;database;data;rag + vector;search;database;data;rag;search;llm;generative ai;ai;genai From 5964a34753d9425265e04bece666fcbcca306d57 Mon Sep 17 00:00:00 2001 From: Chris Pietschmann Date: Sat, 22 Feb 2025 16:13:41 -0500 Subject: [PATCH 20/33] Update build-release.yml --- .github/workflows/build-release.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/build-release.yml b/.github/workflows/build-release.yml index 209eb12..d91d4a5 100644 --- a/.github/workflows/build-release.yml +++ b/.github/workflows/build-release.yml @@ -46,7 +46,9 @@ jobs: # path: ./publish - name: Performance Results - run: cat ./BenchmarkDotNet.Artifacts/results/SharpVectorPerformance.MemoryVectorDatabasePerformance-report-github.md >> $GITHUB_STEP_SUMMARY + run: | + echo "## Performance Results" > $GITHUB_STEP_SUMMARY + cat ./BenchmarkDotNet.Artifacts/results/SharpVectorPerformance.MemoryVectorDatabasePerformance-report-github.md >> $GITHUB_STEP_SUMMARY - name: Upload Performance artifact uses: actions/upload-artifact@v4 From f780fe40136f13656abcf355ff4ff56d0c36dc22 Mon Sep 17 00:00:00 2001 From: Chris Pietschmann Date: Sat, 22 Feb 2025 16:15:47 -0500 Subject: [PATCH 21/33] Update VectorDatabaseTests.cs --- src/SharpVectorTest/VectorDatabaseTests.cs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/SharpVectorTest/VectorDatabaseTests.cs b/src/SharpVectorTest/VectorDatabaseTests.cs index fbf8dea..7db96b3 100644 --- a/src/SharpVectorTest/VectorDatabaseTests.cs +++ b/src/SharpVectorTest/VectorDatabaseTests.cs @@ -531,7 +531,7 @@ public async Task SaveLoadFile_002() timer.Stop(); Console.WriteLine($"SaveLoadFile_002: Save File: {timer.ElapsedMilliseconds} ms"); - Assert.IsTrue(timer.ElapsedMilliseconds < 200, $"SaveLoadFile_002: Save File took too long - Expected: < 200 - Actual: {timer.ElapsedMilliseconds} ms"); + 
Assert.IsTrue(timer.ElapsedMilliseconds < 300, $"SaveLoadFile_002: Save File took too long - Expected: < 300 - Actual: {timer.ElapsedMilliseconds} ms"); var databaseTwo = new MemoryVectorDatabase(); timer.Restart(); @@ -539,7 +539,7 @@ public async Task SaveLoadFile_002() timer.Stop(); Console.WriteLine($"SaveLoadFile_002: Load File: {timer.ElapsedMilliseconds} ms"); - Assert.IsTrue(timer.ElapsedMilliseconds < 200, $"SaveLoadFile_002: Load File took too long - Expected: < 200 - Actual: {timer.ElapsedMilliseconds} ms"); + Assert.IsTrue(timer.ElapsedMilliseconds < 300, $"SaveLoadFile_002: Load File took too long - Expected: < 300 - Actual: {timer.ElapsedMilliseconds} ms"); var secondResult = await databaseTwo.SearchAsync("Lion King", pageCount: 5); Assert.AreEqual(5, secondResult.Texts.Count()); From 7c72b5c294d3a4c26914221e2f7626cab924a702 Mon Sep 17 00:00:00 2001 From: Chris Pietschmann Date: Sat, 22 Feb 2025 16:16:59 -0500 Subject: [PATCH 22/33] Update VectorDatabaseTests.cs --- src/SharpVectorTest/VectorDatabaseTests.cs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/SharpVectorTest/VectorDatabaseTests.cs b/src/SharpVectorTest/VectorDatabaseTests.cs index 7db96b3..01a6b79 100644 --- a/src/SharpVectorTest/VectorDatabaseTests.cs +++ b/src/SharpVectorTest/VectorDatabaseTests.cs @@ -531,6 +531,7 @@ public async Task SaveLoadFile_002() timer.Stop(); Console.WriteLine($"SaveLoadFile_002: Save File: {timer.ElapsedMilliseconds} ms"); + // This is a smoke test to just make sure that the save file is not taking way longer than expected Assert.IsTrue(timer.ElapsedMilliseconds < 300, $"SaveLoadFile_002: Save File took too long - Expected: < 300 - Actual: {timer.ElapsedMilliseconds} ms"); var databaseTwo = new MemoryVectorDatabase(); @@ -539,6 +540,7 @@ public async Task SaveLoadFile_002() timer.Stop(); Console.WriteLine($"SaveLoadFile_002: Load File: {timer.ElapsedMilliseconds} ms"); + // This is a smoke test to just make sure that the load file is not taking way 
longer than expected Assert.IsTrue(timer.ElapsedMilliseconds < 300, $"SaveLoadFile_002: Load File took too long - Expected: < 300 - Actual: {timer.ElapsedMilliseconds} ms"); var secondResult = await databaseTwo.SearchAsync("Lion King", pageCount: 5); From 72e24202a1e85b7429c766109a7a983093fcfe5f Mon Sep 17 00:00:00 2001 From: Chris Pietschmann Date: Sat, 22 Feb 2025 17:40:05 -0500 Subject: [PATCH 23/33] Add DatabaseFile.Load static methods and DatabaseFile.LoadDatabaseInfoAsync method --- src/Build5Nines.SharpVector/DatabaseFile.cs | 152 ++++++++++++++++++ .../DatabaseFileException.cs | 97 +++++++++++ src/Build5Nines.SharpVector/DatabaseInfo.cs | 25 ++- .../IVectorDatabaseExtensions.cs | 16 +- .../MemoryVectorDatabaseBase.cs | 40 ++--- src/SharpVectorTest/VectorDatabaseTests.cs | 141 ++++++++++++++++ 6 files changed, 440 insertions(+), 31 deletions(-) create mode 100644 src/Build5Nines.SharpVector/DatabaseFile.cs create mode 100644 src/Build5Nines.SharpVector/DatabaseFileException.cs diff --git a/src/Build5Nines.SharpVector/DatabaseFile.cs b/src/Build5Nines.SharpVector/DatabaseFile.cs new file mode 100644 index 0000000..142d7e9 --- /dev/null +++ b/src/Build5Nines.SharpVector/DatabaseFile.cs @@ -0,0 +1,152 @@ + +using System.IO.Compression; +using System.Text.Json; + +namespace Build5Nines.SharpVector; + +public static class DatabaseFile +{ + /// + /// Load the vector database from a stream + /// + /// + /// + /// + public static async Task> Load(Stream stream) + { + return await Load, TMetadata>(stream); + } + + /// + /// Load the vector database from a stream + /// + /// + /// + /// + /// + public static async Task Load(Stream stream) + where TVectorDatabase : MemoryVectorDatabase, new() + { + return await Load(stream); + } + + /// + /// Load the vector database from a stream + /// + /// + /// + public static async Task Load(Stream stream) + where TVectorDatabase : IVectorDatabase, new() + where TId : notnull + { + var vdb = new TVectorDatabase(); + return await 
Load(vdb, stream); + } + + /// + /// Load the vector database from a stream + /// + /// + /// + /// + public static async Task Load(TVectorDatabase vdb, Stream stream) + where TVectorDatabase : IVectorDatabase + where TId : notnull + { + await vdb.DeserializeFromJsonStreamAsync(stream); + return vdb; + } + + /// + /// Load the vector database from a file + /// + /// + /// + /// + public static async Task> Load(string filePath) + { + return await Load, TMetadata>(filePath); + } + + /// + /// Load the vector database from a file + /// + /// + /// + /// + /// + public static async Task Load(string filePath) + where TVectorDatabase : MemoryVectorDatabase, new() + { + return await Load(filePath); + } + + /// + /// Load the vector database from a file + /// + /// + /// + public static async Task Load(string filePath) + where TVectorDatabase: IVectorDatabase, new() + where TId : notnull + { + var vdb = new TVectorDatabase(); + return await Load(vdb, filePath); + } + + /// + /// Load the vector database from a file + /// + /// + /// + /// + public static async Task Load(TVectorDatabase vdb, string filePath) + where TVectorDatabase : IVectorDatabase + where TId : notnull + { + await vdb.LoadFromFileAsync(filePath); + return vdb; + } + + public static async Task LoadDatabaseInfoAsync(Stream stream) + { + using (var archive = new ZipArchive(stream, ZipArchiveMode.Read)) + { + var entryDatabaseType = archive.GetEntry("database.json"); + if (entryDatabaseType != null) + { + using (var entryStream = entryDatabaseType.Open()) + { + var databaseTypeStream = new MemoryStream(); + await entryStream.CopyToAsync(databaseTypeStream); + databaseTypeStream.Position = 0; + + var databaseTypeBytes = new byte[databaseTypeStream.Length]; + await databaseTypeStream.ReadAsync(databaseTypeBytes); + var databaseInfoJson = System.Text.Encoding.UTF8.GetString(databaseTypeBytes); + + var databaseInfo = JsonSerializer.Deserialize(databaseInfoJson); + + if (databaseInfo == null) + { + throw new
DatabaseFileInfoException("Database info entry is null."); + } + + return databaseInfo; + } + } + else + { + throw new DatabaseFileMissingEntryException("Database info entry not found.", "database"); + } + } + } + + public static async Task LoadDatabaseInfoAsync(string filePath) + { + using (var stream = File.OpenRead(filePath)) + { + return await LoadDatabaseInfoAsync(stream); + } + } +} \ No newline at end of file diff --git a/src/Build5Nines.SharpVector/DatabaseFileException.cs b/src/Build5Nines.SharpVector/DatabaseFileException.cs new file mode 100644 index 0000000..b5468af --- /dev/null +++ b/src/Build5Nines.SharpVector/DatabaseFileException.cs @@ -0,0 +1,97 @@ +namespace Build5Nines.SharpVector; + +public class DatabaseFileException : Exception +{ + public DatabaseFileException() + { + } + + public DatabaseFileException(string message) + : base(message) + { + } + + public DatabaseFileException(string message, Exception innerException) + : base(message, innerException) + { + } +} + +public class DatabaseFileInfoException : DatabaseFileException +{ + public DatabaseFileInfoException() + { + } + + public DatabaseFileInfoException(string message) + : base(message) + { + } + + public DatabaseFileInfoException(string message, Exception innerException) + : base(message, innerException) + { + } +} + +public class DatabaseFileSchemaException : DatabaseFileException +{ + public DatabaseFileSchemaException() + { + } + + public DatabaseFileSchemaException(string message) + : base(message) + { + } + + public DatabaseFileSchemaException(string message, Exception innerException) + : base(message, innerException) + { + } +} + +public class DatabaseFileVersionException : DatabaseFileException +{ + public DatabaseFileVersionException() + { + } + + public DatabaseFileVersionException(string message) + : base(message) + { + } + + public DatabaseFileVersionException(string message, Exception innerException) + : base(message, innerException) + { + } +} + +public class 
DatabaseFileClassTypeException : DatabaseFileException +{ + public DatabaseFileClassTypeException() + { + } + + public DatabaseFileClassTypeException(string message) + : base(message) + { + } + + public DatabaseFileClassTypeException(string message, Exception innerException) + : base(message, innerException) + { + } +} + +public class DatabaseFileMissingEntryException : DatabaseFileException +{ + public DatabaseFileMissingEntryException(string message, string missingEntry) + : base(message) + { + MissingEntry = missingEntry; + } + + public string MissingEntry { get; private set; } +} \ No newline at end of file diff --git a/src/Build5Nines.SharpVector/DatabaseInfo.cs b/src/Build5Nines.SharpVector/DatabaseInfo.cs index aadde0d..ccd8866 100644 --- a/src/Build5Nines.SharpVector/DatabaseInfo.cs +++ b/src/Build5Nines.SharpVector/DatabaseInfo.cs @@ -1,6 +1,25 @@ -internal class DatabaseInfo +namespace Build5Nines.SharpVector; + +public class DatabaseInfo { - public string? Schema { get; set; } = "Build5Nines.SharpVector"; + internal static string SupportedVersion = "1.0.0"; + internal static string SupportedSchema = "Build5Nines.SharpVector"; + + public DatabaseInfo() + : this(null, null, null) + { } + public DatabaseInfo(string? classType) + : this(SupportedSchema, SupportedVersion, classType) + { } + + public DatabaseInfo(string? schema, string? version, string? classType) + { + Schema = schema; + Version = version; + ClassType = classType; + } + + public string? Schema { get; set; } public string? Version { get; set; } - public string? VectorDatabaseClassType { get; set; } + public string? 
ClassType { get; set; } } \ No newline at end of file diff --git a/src/Build5Nines.SharpVector/IVectorDatabaseExtensions.cs b/src/Build5Nines.SharpVector/IVectorDatabaseExtensions.cs index 83b9bcf..13a1ce4 100644 --- a/src/Build5Nines.SharpVector/IVectorDatabaseExtensions.cs +++ b/src/Build5Nines.SharpVector/IVectorDatabaseExtensions.cs @@ -2,37 +2,37 @@ namespace Build5Nines.SharpVector; public static class IVectorDatabaseExtensions { - public static async Task SaveToFileAsync(this IVectorDatabase vectorDatabase, string filename) + public static async Task SaveToFileAsync(this IVectorDatabase vectorDatabase, string filePath) where TId : notnull { - using (var stream = new FileStream(filename, FileMode.Create, FileAccess.Write)) + using (var stream = new FileStream(filePath, FileMode.Create, FileAccess.Write)) { await vectorDatabase.SerializeToJsonStreamAsync(stream); } } - public static void SaveToFile(this IVectorDatabase vectorDatabase, string filename) + public static void SaveToFile(this IVectorDatabase vectorDatabase, string filePath) where TId : notnull { - using (var stream = new FileStream(filename, FileMode.Create, FileAccess.Write)) + using (var stream = new FileStream(filePath, FileMode.Create, FileAccess.Write)) { vectorDatabase.SerializeToJsonStream(stream); } } - public static async Task LoadFromFileAsync(this IVectorDatabase vectorDatabase, string filename) + public static async Task LoadFromFileAsync(this IVectorDatabase vectorDatabase, string filePath) where TId : notnull { - using (var stream = new FileStream(filename, FileMode.Open, FileAccess.Read)) + using (var stream = new FileStream(filePath, FileMode.Open, FileAccess.Read)) { await vectorDatabase.DeserializeFromJsonStreamAsync(stream); } } - public static void LoadFromFile(IVectorDatabase vectorDatabase, string filename) + public static void LoadFromFile(IVectorDatabase vectorDatabase, string filePath) where TId : notnull { - using (var stream = new FileStream(filename, FileMode.Open, 
FileAccess.Read)) + using (var stream = new FileStream(filePath, FileMode.Open, FileAccess.Read)) { vectorDatabase.DeserializeFromJsonStream(stream); } diff --git a/src/Build5Nines.SharpVector/MemoryVectorDatabaseBase.cs b/src/Build5Nines.SharpVector/MemoryVectorDatabaseBase.cs index 8b22742..479a179 100644 --- a/src/Build5Nines.SharpVector/MemoryVectorDatabaseBase.cs +++ b/src/Build5Nines.SharpVector/MemoryVectorDatabaseBase.cs @@ -255,13 +255,11 @@ public virtual async Task SerializeToJsonStreamAsync(Stream stream) using (var archive = new ZipArchive(stream, ZipArchiveMode.Create, true)) { - var entryDatabaseType = archive.CreateEntry("Database.json"); + var entryDatabaseType = archive.CreateEntry("database.json"); using (var entryStream = entryDatabaseType.Open()) { - var databaseInfo = new DatabaseInfo { - Version = "1.0.0", - VectorDatabaseClassType = this.GetType().FullName - }; + var databaseInfo = new DatabaseInfo(this.GetType().FullName); + var databaseInfoJson = JsonSerializer.Serialize(databaseInfo); if (databaseInfoJson != null) @@ -275,7 +273,7 @@ public virtual async Task SerializeToJsonStreamAsync(Stream stream) throw new InvalidOperationException("Type name cannot be null."); } } - var entryVectorStore = archive.CreateEntry("VectorStore.json"); + var entryVectorStore = archive.CreateEntry("vectorstore.json"); using (var entryStream = entryVectorStore.Open()) { streamVectorStore.Position = 0; @@ -283,7 +281,7 @@ public virtual async Task SerializeToJsonStreamAsync(Stream stream) await entryStream.FlushAsync(); } - var entryVocabularyStore = archive.CreateEntry("VocabularyStore.json"); + var entryVocabularyStore = archive.CreateEntry("vocabularystore.json"); using (var entryStream = entryVocabularyStore.Open()) { streamVocabularyStore.Position = 0; @@ -313,7 +311,7 @@ public virtual async Task DeserializeFromJsonStreamAsync(Stream stream) using (var archive = new ZipArchive(stream, ZipArchiveMode.Read)) { - var entryDatabaseType = 
archive.GetEntry("Database.json"); + var entryDatabaseType = archive.GetEntry("database.json"); if (entryDatabaseType != null) { using (var entryStream = entryDatabaseType.Open()) @@ -330,31 +328,32 @@ public virtual async Task DeserializeFromJsonStreamAsync(Stream stream) if (databaseInfo == null) { - throw new InvalidOperationException("Database info entry is null."); + throw new DatabaseFileInfoException("Database info entry is null."); } - if (databaseInfo.Schema != "Build5Nines.SharpVector") + if (databaseInfo.Schema != DatabaseInfo.SupportedSchema) { - throw new InvalidOperationException("The database schema does not match the expected schema."); + throw new DatabaseFileSchemaException($"The database schema does not match the expected schema (Expected: {DatabaseInfo.SupportedSchema} - Actual: {databaseInfo.Schema})."); } - if (databaseInfo.Version != "1.0.0") + if (databaseInfo.Version != DatabaseInfo.SupportedVersion) { - throw new InvalidOperationException($"The database version does not match the expected version (Expected: 1.0.0 - Actual: {databaseInfo.Version})."); + throw new DatabaseFileVersionException($"The database version does not match the expected version (Expected: {DatabaseInfo.SupportedVersion} - Actual: {databaseInfo.Version})."); } - if (databaseInfo.VectorDatabaseClassType != this.GetType().FullName) + if (databaseInfo.ClassType != this.GetType().FullName) { - throw new InvalidOperationException($"The database type does not match the expected type [Expected: {databaseInfo.VectorDatabaseClassType}] "); + throw new DatabaseFileClassTypeException($"The database class type does not match the expected type (Expected: {this.GetType().FullName} - Actual: {databaseInfo.ClassType})"); } } } else { - throw new InvalidOperationException("Database type entry not found."); + throw new DatabaseFileMissingEntryException("Database info entry not found.", "database"); } - var entryVectorStore = archive.GetEntry("VectorStore.json"); + + var entryVectorStore 
= archive.GetEntry("vectorstore.json"); if (entryVectorStore != null) { using (var entryStream = entryVectorStore.Open()) @@ -364,10 +363,10 @@ public virtual async Task DeserializeFromJsonStreamAsync(Stream stream) } else { - throw new InvalidOperationException("Vector Store entry not found."); + throw new DatabaseFileMissingEntryException("Vector Store entry not found.", "vectorstore"); } - var entryVocabularyStore = archive.GetEntry("VocabularyStore.json"); + var entryVocabularyStore = archive.GetEntry("vocabularystore.json"); if (entryVocabularyStore != null) { using (var entryStream = entryVocabularyStore.Open()) @@ -377,8 +376,9 @@ public virtual async Task DeserializeFromJsonStreamAsync(Stream stream) } else { - throw new InvalidOperationException("Vocabulary Store entry not found."); + throw new DatabaseFileMissingEntryException("Vocabulary Store entry not found.", "vocabularystore"); } + } } diff --git a/src/SharpVectorTest/VectorDatabaseTests.cs b/src/SharpVectorTest/VectorDatabaseTests.cs index 01a6b79..d24c74e 100644 --- a/src/SharpVectorTest/VectorDatabaseTests.cs +++ b/src/SharpVectorTest/VectorDatabaseTests.cs @@ -1,6 +1,7 @@ namespace SharpVectorTest; using System.Diagnostics; +using System.Threading.Tasks; using Build5Nines.SharpVector; using Build5Nines.SharpVector.Id; using Build5Nines.SharpVector.Preprocessing; @@ -557,6 +558,146 @@ public async Task SaveLoadFile_002() Assert.AreEqual("NewNewNew", thirdResult.Texts.First().Text); Assert.AreEqual(4.5, thirdResult.Texts.First().Metadata); } + + [TestMethod] + public async Task DatabaseFile_LoadDatabaseInfo_001() + { + var vdb = new MemoryVectorDatabase(); + + // // Load Vector Database with some sample text + var id = vdb.AddText("The Lion King is a 1994 Disney animated film about a young lion cub named Simba who is the heir to the throne of an African savanna.", "{ value: \"JSON Metadata Value\" }"); + + vdb.SaveToFile("DatabaseFile_LoadDatabaseInfo_001.b59vdb"); + + var databaseInfo = await 
DatabaseFile.LoadDatabaseInfoAsync("DatabaseFile_LoadDatabaseInfo_001.b59vdb"); + + Assert.AreEqual("Build5Nines.SharpVector", databaseInfo.Schema); + Assert.AreEqual("1.0.0", databaseInfo.Version); + Assert.AreEqual("Build5Nines.SharpVector.MemoryVectorDatabase`1[[System.String, System.Private.CoreLib, Version=8.0.0.0, Culture=neutral, PublicKeyToken=7cec85d7bea7798e]]", databaseInfo.ClassType); + } + + [TestMethod] + public async Task DatabaseFile_LoadDatabaseInfo_002() + { + var vdb = new MemoryVectorDatabase(); + + // // Load Vector Database with some sample text + var id = vdb.AddText("The Lion King is a 1994 Disney animated film about a young lion cub named Simba who is the heir to the throne of an African savanna.", "{ value: \"JSON Metadata Value\" }"); + + var stream = new MemoryStream(); + await vdb.SerializeToJsonStreamAsync(stream); + stream.Position = 0; + + var databaseInfo = await DatabaseFile.LoadDatabaseInfoAsync(stream); + + Assert.AreEqual("Build5Nines.SharpVector", databaseInfo.Schema); + Assert.AreEqual("1.0.0", databaseInfo.Version); + Assert.AreEqual("Build5Nines.SharpVector.MemoryVectorDatabase`1[[System.String, System.Private.CoreLib, Version=8.0.0.0, Culture=neutral, PublicKeyToken=7cec85d7bea7798e]]", databaseInfo.ClassType); + } + + [TestMethod] + public async Task DatabaseFile_LoadStream_002() + { + var vdb = new MemoryVectorDatabase(); + + // // Load Vector Database with some sample text + var id = vdb.AddText("The Lion King is a 1994 Disney animated film about a young lion cub named Simba who is the heir to the throne of an African savanna.", "{ value: \"JSON Metadata Value\" }"); + + var stream = new MemoryStream(); + await vdb.SerializeToJsonStreamAsync(stream); + stream.Position = 0; + + vdb = await DatabaseFile.Load, string>(stream); + + var results = vdb.Search("Lion King"); + + Assert.AreEqual(1, results.Texts.Count()); + Assert.IsTrue(results.Texts.First().Text.Contains("Lion King")); + Assert.AreEqual("{ value: \"JSON Metadata 
Value\" }", results.Texts.First().Metadata); + Assert.AreEqual(0.3396831452846527, results.Texts.First().VectorComparison); + + vdb.UpdateTextMetadata(id, "{ value: \"New Value\" }"); + + results = vdb.Search("Lion King"); + Assert.AreEqual("{ value: \"New Value\" }", results.Texts.First().Metadata); + } + + [TestMethod] + public async Task DatabaseFile_LoadStream_003() + { + var vdb = new MemoryVectorDatabase(); + + // // Load Vector Database with some sample text + var id = vdb.AddText("The Lion King is a 1994 Disney animated film about a young lion cub named Simba who is the heir to the throne of an African savanna.", "{ value: \"JSON Metadata Value\" }"); + + var stream = new MemoryStream(); + await vdb.SerializeToJsonStreamAsync(stream); + stream.Position = 0; + + vdb = await DatabaseFile.Load(stream); + var results = vdb.Search("Lion King"); + + Assert.AreEqual(1, results.Texts.Count()); + Assert.IsTrue(results.Texts.First().Text.Contains("Lion King")); + Assert.AreEqual("{ value: \"JSON Metadata Value\" }", results.Texts.First().Metadata); + Assert.AreEqual(0.3396831452846527, results.Texts.First().VectorComparison); + + vdb.UpdateTextMetadata(id, "{ value: \"New Value\" }"); + + results = vdb.Search("Lion King"); + Assert.AreEqual("{ value: \"New Value\" }", results.Texts.First().Metadata); + } + + [TestMethod] + public async Task DatabaseFile_LoadFile_002() + { + var vdb = new MemoryVectorDatabase(); + + // // Load Vector Database with some sample text + var id = vdb.AddText("The Lion King is a 1994 Disney animated film about a young lion cub named Simba who is the heir to the throne of an African savanna.", "{ value: \"JSON Metadata Value\" }"); + + var filename = "DatabaseFile_LoadFile_003.b59vdb"; + await vdb.SaveToFileAsync(filename); + + vdb = await DatabaseFile.Load, string>(filename); + + var results = vdb.Search("Lion King"); + + Assert.AreEqual(1, results.Texts.Count()); + Assert.IsTrue(results.Texts.First().Text.Contains("Lion King")); + 
Assert.AreEqual("{ value: \"JSON Metadata Value\" }", results.Texts.First().Metadata); + Assert.AreEqual(0.3396831452846527, results.Texts.First().VectorComparison); + + vdb.UpdateTextMetadata(id, "{ value: \"New Value\" }"); + + results = vdb.Search("Lion King"); + Assert.AreEqual("{ value: \"New Value\" }", results.Texts.First().Metadata); + } + + [TestMethod] + public async Task DatabaseFile_LoadFile_003() + { + var vdb = new MemoryVectorDatabase(); + + // // Load Vector Database with some sample text + var id = vdb.AddText("The Lion King is a 1994 Disney animated film about a young lion cub named Simba who is the heir to the throne of an African savanna.", "{ value: \"JSON Metadata Value\" }"); + + var filename = "DatabaseFile_LoadFile_003.b59vdb"; + await vdb.SaveToFileAsync(filename); + + vdb = await DatabaseFile.Load(filename); + var results = vdb.Search("Lion King"); + + Assert.AreEqual(1, results.Texts.Count()); + Assert.IsTrue(results.Texts.First().Text.Contains("Lion King")); + Assert.AreEqual("{ value: \"JSON Metadata Value\" }", results.Texts.First().Metadata); + Assert.AreEqual(0.3396831452846527, results.Texts.First().VectorComparison); + + vdb.UpdateTextMetadata(id, "{ value: \"New Value\" }"); + + results = vdb.Search("Lion King"); + Assert.AreEqual("{ value: \"New Value\" }", results.Texts.First().Metadata); + } } From 8f85142bc110700f55eb72cba6348eb8319de6f4 Mon Sep 17 00:00:00 2001 From: Chris Pietschmann Date: Sat, 22 Feb 2025 17:47:41 -0500 Subject: [PATCH 24/33] add code coverage --- .github/workflows/dotnet-tests.yml | 31 ++++++++++++++++++++++++++++-- .gitignore | 1 + 2 files changed, 30 insertions(+), 2 deletions(-) diff --git a/.github/workflows/dotnet-tests.yml b/.github/workflows/dotnet-tests.yml index 7f283aa..e51f7df 100644 --- a/.github/workflows/dotnet-tests.yml +++ b/.github/workflows/dotnet-tests.yml @@ -1,9 +1,14 @@ name: .NET Core Tests on: + push: + branches: + - main + - dev pull_request: branches: - main + - dev 
workflow_dispatch: jobs: @@ -28,5 +33,27 @@ jobs: - name: Build run: dotnet build --no-restore - - name: Run tests - run: dotnet test --no-build --verbosity normal \ No newline at end of file + - name: Run tests with code coverage + run: dotnet test --no-build --verbosity normal --collect:"XPlat Code Coverage" + + - name: Code Coverage Results + run: | + # Install the ReportGenerator global tool if not already installed + dotnet tool install -g dotnet-reportgenerator-globaltool || true + export PATH="$PATH:~/.dotnet/tools" + + # Generate a Markdown summary from the coverage file + dotnet reportgenerator \ + -reports:./coverage.cobertura.xml \ + -targetdir:./coverage-output \ + -reporttypes:MarkdownSummary + + # Write header and append the generated report to the GitHub Step Summary + echo "## Code Coverage" > $GITHUB_STEP_SUMMARY + cat ./coverage-output/Report.md >> $GITHUB_STEP_SUMMARY + + - name: Upload test results artifact + uses: actions/upload-artifact@v4 + with: + name: test-results + path: '**/TestResults/**' \ No newline at end of file diff --git a/.gitignore b/.gitignore index 2cde6db..a787c7d 100644 --- a/.gitignore +++ b/.gitignore @@ -5,3 +5,4 @@ bin .DS_Store BenchmarkDotNet.Artifacts/ +TestResults/ \ No newline at end of file From 40a1bf20bd295ec4c24b8eeb0d4c673e97ea298b Mon Sep 17 00:00:00 2001 From: Chris Pietschmann Date: Sat, 22 Feb 2025 17:59:51 -0500 Subject: [PATCH 25/33] Update dotnet-tests.yml --- .github/workflows/dotnet-tests.yml | 29 +++++++++++------------------ 1 file changed, 11 insertions(+), 18 deletions(-) diff --git a/.github/workflows/dotnet-tests.yml b/.github/workflows/dotnet-tests.yml index e51f7df..054bd74 100644 --- a/.github/workflows/dotnet-tests.yml +++ b/.github/workflows/dotnet-tests.yml @@ -34,26 +34,19 @@ jobs: run: dotnet build --no-restore - name: Run tests with code coverage - run: dotnet test --no-build --verbosity normal --collect:"XPlat Code Coverage" - - - name: Code Coverage Results - run: | - # Install the 
ReportGenerator global tool if not already installed - dotnet tool install -g dotnet-reportgenerator-globaltool || true - export PATH="$PATH:~/.dotnet/tools" - - # Generate a Markdown summary from the coverage file - dotnet reportgenerator \ - -reports:./coverage.cobertura.xml \ - -targetdir:./coverage-output \ - -reporttypes:MarkdownSummary - - # Write header and append the generated report to the GitHub Step Summary - echo "## Code Coverage" > $GITHUB_STEP_SUMMARY - cat ./coverage-output/Report.md >> $GITHUB_STEP_SUMMARY + run: dotnet test --no-build --verbosity normal --logger trx --results-directory "./TestResults/Coverage/" --collect:"XPlat Code Coverage" + - name: Publish Test Results + uses: EnricoMi/publish-unit-test-result-action@v2 + if: always() + with: + files: | + TestResults/**/*.xml + TestResults/**/*.trx + TestResults/**/*.json + - name: Upload test results artifact uses: actions/upload-artifact@v4 with: name: test-results - path: '**/TestResults/**' \ No newline at end of file + path: '**/TestResults/**' From e374fbfb9c36288696172701a73e9ac94bc4596a Mon Sep 17 00:00:00 2001 From: Chris Pietschmann Date: Sat, 22 Feb 2025 18:00:59 -0500 Subject: [PATCH 26/33] Update dotnet-tests.yml --- .github/workflows/dotnet-tests.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/dotnet-tests.yml b/.github/workflows/dotnet-tests.yml index 054bd74..b522797 100644 --- a/.github/workflows/dotnet-tests.yml +++ b/.github/workflows/dotnet-tests.yml @@ -41,9 +41,9 @@ jobs: if: always() with: files: | - TestResults/**/*.xml - TestResults/**/*.trx - TestResults/**/*.json + ./TestResults/**/*.xml + ./TestResults/**/*.trx + ./TestResults/**/*.json - name: Upload test results artifact uses: actions/upload-artifact@v4 From 16798e7ca90c88ff796bddcacedb642e8da8028c Mon Sep 17 00:00:00 2001 From: Chris Pietschmann Date: Sat, 22 Feb 2025 18:06:37 -0500 Subject: [PATCH 27/33] Update dotnet-tests.yml --- .github/workflows/dotnet-tests.yml 
| 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/.github/workflows/dotnet-tests.yml b/.github/workflows/dotnet-tests.yml index b522797..5c655cd 100644 --- a/.github/workflows/dotnet-tests.yml +++ b/.github/workflows/dotnet-tests.yml @@ -34,16 +34,7 @@ jobs: run: dotnet build --no-restore - name: Run tests with code coverage - run: dotnet test --no-build --verbosity normal --logger trx --results-directory "./TestResults/Coverage/" --collect:"XPlat Code Coverage" - - - name: Publish Test Results - uses: EnricoMi/publish-unit-test-result-action@v2 - if: always() - with: - files: | - ./TestResults/**/*.xml - ./TestResults/**/*.trx - ./TestResults/**/*.json + run: dotnet test --no-build --verbosity normal --results-directory "./TestResults/Coverage/" --collect:"XPlat Code Coverage" - name: Upload test results artifact uses: actions/upload-artifact@v4 From 813792571e02e41ceac08c3b2339d4f4fbeb60c1 Mon Sep 17 00:00:00 2001 From: Chris Pietschmann Date: Sat, 22 Feb 2025 18:48:04 -0500 Subject: [PATCH 28/33] added a couple Chinese character unit tests --- src/SharpVectorTest/VectorDatabaseTests.cs | 56 ++++++++++++++++++++++ 1 file changed, 56 insertions(+) diff --git a/src/SharpVectorTest/VectorDatabaseTests.cs b/src/SharpVectorTest/VectorDatabaseTests.cs index d24c74e..6b6c055 100644 --- a/src/SharpVectorTest/VectorDatabaseTests.cs +++ b/src/SharpVectorTest/VectorDatabaseTests.cs @@ -304,6 +304,62 @@ public void Text_Update_01() Assert.AreEqual("{ value: \"JSON Metadata Value\" }", results.Texts.First().Metadata); } + [TestMethod] + public void Text_Update_01_Chinese() + { + var vdb = new MemoryVectorDatabase(); + + // Load Vector Database with Chinese sample text and JSON metadata + var id = vdb.AddText("狮子王是一部1994年的迪士尼动画电影,讲述一个小狮子辛巴必将继承非洲大草原王位的故事。", "{ value: \"元数据初始值\" }"); + + // Verify that search returns the expected text + var results = vdb.Search("狮子"); + Assert.AreEqual(1, results.Texts.Count()); + 
Assert.IsTrue(results.Texts.First().Text.Contains("狮子王")); + + // Update the text + vdb.UpdateText(id, "狮子王是一部非常棒的电影!"); + + // Verify that the text is updated but the metadata remains unchanged + results = vdb.Search("狮子"); + Assert.AreEqual("狮子王是一部非常棒的电影!", results.Texts.First().Text); + Assert.AreEqual("{ value: \"元数据初始值\" }", results.Texts.First().Metadata); + } + + [TestMethod] + public void Text_Update_01_English_and_Chinese() + { + var vdb = new MemoryVectorDatabase(); + + // Load the Vector Database with some initial sample texts. + vdb.AddText("The Lion King is a 1994 Disney animated film about a young lion cub named Simba who is the heir to the throne of an African savanna.", "{ value: \"Lion King Metadata\" }"); + vdb.AddText("Aladdin is a 2019 live-action Disney adaptation of the 1992 animated classic about a street urchin who finds a magic lamp.", "{ value: \"Aladdin Metadata\" }"); + vdb.AddText("The Little Mermaid is a 2023 live-action adaptation of Disney's classic animated film about Ariel.", "{ value: \"Little Mermaid Metadata\" }"); + + // Add additional texts to the database. + vdb.AddText("Mulan is an epic tale of bravery and honor in ancient China.", "{ value: \"Mulan Metadata\" }"); + vdb.AddText("Crouching Tiger, Hidden Dragon is a martial arts masterpiece with breathtaking scenes.", "{ value: \"Crouching Metadata\" }"); + vdb.AddText("In the Mood for Love is a visually stunning film about forbidden romance.", "{ value: \"In the Mood Metadata\" }"); + + // Add more Chinese texts. + vdb.AddText("大闹天宫是一部经典的中国动画电影,讲述孙悟空大闹天宫的故事。", "{ value: \"元数据新增1\" }"); + vdb.AddText("霸王别姬是一部关于爱与背叛的中国史诗电影。", "{ value: \"元数据新增2\" }"); + + // Verify that a search for "Lion King" returns the expected result. 
+ var lionResults = vdb.Search("Lion King"); + Assert.IsTrue(lionResults.Texts.Any(t => t.Text.Contains("Lion King"))); + Assert.AreEqual("{ value: \"Lion King Metadata\" }", lionResults.Texts.First().Metadata); + + // Verify that the Chinese texts were added. + var daNaoResults = vdb.Search("大闹天宫"); + Assert.IsTrue(daNaoResults.Texts.Any(t => t.Text.Contains("大闹天宫"))); + Assert.AreEqual("{ value: \"元数据新增1\" }", daNaoResults.Texts.First().Metadata); + + var baiJieResults = vdb.Search("霸王别姬"); + Assert.IsTrue(baiJieResults.Texts.Any(t => t.Text.Contains("霸王别姬"))); + Assert.AreEqual("{ value: \"元数据新增2\" }", baiJieResults.Texts.First().Metadata); + } + [TestMethod] public void Text_Metadata_String_01() { From 923af239574cda08b92b2666bfa2b149c2c35d07 Mon Sep 17 00:00:00 2001 From: Chris Pietschmann Date: Sat, 22 Feb 2025 18:59:40 -0500 Subject: [PATCH 29/33] Comment out the Chinese character tests for now due to issue #8 --- src/SharpVectorTest/VectorDatabaseTests.cs | 110 ++++++++++----------- 1 file changed, 55 insertions(+), 55 deletions(-) diff --git a/src/SharpVectorTest/VectorDatabaseTests.cs b/src/SharpVectorTest/VectorDatabaseTests.cs index 6b6c055..58a943b 100644 --- a/src/SharpVectorTest/VectorDatabaseTests.cs +++ b/src/SharpVectorTest/VectorDatabaseTests.cs @@ -304,61 +304,61 @@ public void Text_Update_01() Assert.AreEqual("{ value: \"JSON Metadata Value\" }", results.Texts.First().Metadata); } - [TestMethod] - public void Text_Update_01_Chinese() - { - var vdb = new MemoryVectorDatabase(); - - // Load Vector Database with Chinese sample text and JSON metadata - var id = vdb.AddText("狮子王是一部1994年的迪士尼动画电影,讲述一个小狮子辛巴必将继承非洲大草原王位的故事。", "{ value: \"元数据初始值\" }"); - - // Verify that search returns the expected text - var results = vdb.Search("狮子"); - Assert.AreEqual(1, results.Texts.Count()); - Assert.IsTrue(results.Texts.First().Text.Contains("狮子王")); - - // Update the text - vdb.UpdateText(id, "狮子王是一部非常棒的电影!"); - - // Verify that the text is updated but the 
metadata remains unchanged - results = vdb.Search("狮子"); - Assert.AreEqual("狮子王是一部非常棒的电影!", results.Texts.First().Text); - Assert.AreEqual("{ value: \"元数据初始值\" }", results.Texts.First().Metadata); - } - - [TestMethod] - public void Text_Update_01_English_and_Chinese() - { - var vdb = new MemoryVectorDatabase(); - - // Load the Vector Database with some initial sample texts. - vdb.AddText("The Lion King is a 1994 Disney animated film about a young lion cub named Simba who is the heir to the throne of an African savanna.", "{ value: \"Lion King Metadata\" }"); - vdb.AddText("Aladdin is a 2019 live-action Disney adaptation of the 1992 animated classic about a street urchin who finds a magic lamp.", "{ value: \"Aladdin Metadata\" }"); - vdb.AddText("The Little Mermaid is a 2023 live-action adaptation of Disney's classic animated film about Ariel.", "{ value: \"Little Mermaid Metadata\" }"); - - // Add additional texts to the database. - vdb.AddText("Mulan is an epic tale of bravery and honor in ancient China.", "{ value: \"Mulan Metadata\" }"); - vdb.AddText("Crouching Tiger, Hidden Dragon is a martial arts masterpiece with breathtaking scenes.", "{ value: \"Crouching Metadata\" }"); - vdb.AddText("In the Mood for Love is a visually stunning film about forbidden romance.", "{ value: \"In the Mood Metadata\" }"); - - // Add more Chinese texts. - vdb.AddText("大闹天宫是一部经典的中国动画电影,讲述孙悟空大闹天宫的故事。", "{ value: \"元数据新增1\" }"); - vdb.AddText("霸王别姬是一部关于爱与背叛的中国史诗电影。", "{ value: \"元数据新增2\" }"); - - // Verify that a search for "Lion King" returns the expected result. - var lionResults = vdb.Search("Lion King"); - Assert.IsTrue(lionResults.Texts.Any(t => t.Text.Contains("Lion King"))); - Assert.AreEqual("{ value: \"Lion King Metadata\" }", lionResults.Texts.First().Metadata); - - // Verify that the Chinese texts were added. 
- var daNaoResults = vdb.Search("大闹天宫"); - Assert.IsTrue(daNaoResults.Texts.Any(t => t.Text.Contains("大闹天宫"))); - Assert.AreEqual("{ value: \"元数据新增1\" }", daNaoResults.Texts.First().Metadata); - - var baiJieResults = vdb.Search("霸王别姬"); - Assert.IsTrue(baiJieResults.Texts.Any(t => t.Text.Contains("霸王别姬"))); - Assert.AreEqual("{ value: \"元数据新增2\" }", baiJieResults.Texts.First().Metadata); - } + // [TestMethod] + // public void Text_Update_01_Chinese() + // { + // var vdb = new MemoryVectorDatabase(); + + // // Load Vector Database with Chinese sample text and JSON metadata + // var id = vdb.AddText("狮子王是一部1994年的迪士尼动画电影,讲述一个小狮子辛巴必将继承非洲大草原王位的故事。", "{ value: \"元数据初始值\" }"); + + // // Verify that search returns the expected text + // var results = vdb.Search("狮子"); + // Assert.AreEqual(1, results.Texts.Count()); + // Assert.IsTrue(results.Texts.First().Text.Contains("狮子王")); + + // // Update the text + // vdb.UpdateText(id, "狮子王是一部非常棒的电影!"); + + // // Verify that the text is updated but the metadata remains unchanged + // results = vdb.Search("狮子"); + // Assert.AreEqual("狮子王是一部非常棒的电影!", results.Texts.First().Text); + // Assert.AreEqual("{ value: \"元数据初始值\" }", results.Texts.First().Metadata); + // } + + // [TestMethod] + // public void Text_Update_01_English_and_Chinese() + // { + // var vdb = new MemoryVectorDatabase(); + + // // Load the Vector Database with some initial sample texts. + // vdb.AddText("The Lion King is a 1994 Disney animated film about a young lion cub named Simba who is the heir to the throne of an African savanna.", "{ value: \"Lion King Metadata\" }"); + // vdb.AddText("Aladdin is a 2019 live-action Disney adaptation of the 1992 animated classic about a street urchin who finds a magic lamp.", "{ value: \"Aladdin Metadata\" }"); + // vdb.AddText("The Little Mermaid is a 2023 live-action adaptation of Disney's classic animated film about Ariel.", "{ value: \"Little Mermaid Metadata\" }"); + + // // Add additional texts to the database. 
+ // vdb.AddText("Mulan is an epic tale of bravery and honor in ancient China.", "{ value: \"Mulan Metadata\" }"); + // vdb.AddText("Crouching Tiger, Hidden Dragon is a martial arts masterpiece with breathtaking scenes.", "{ value: \"Crouching Metadata\" }"); + // vdb.AddText("In the Mood for Love is a visually stunning film about forbidden romance.", "{ value: \"In the Mood Metadata\" }"); + + // // Add more Chinese texts. + // vdb.AddText("大闹天宫是一部经典 的中国动画电影,讲述孙悟空大闹天宫的故事。", "{ value: \"元数据新增1\" }"); + // vdb.AddText("霸王别姬是一部关于 爱与背叛的中国史诗电影。", "{ value: \"元数据新增2\" }"); + + // // Verify that a search for "Lion King" returns the expected result. + // var lionResults = vdb.Search("Lion King"); + // Assert.IsTrue(lionResults.Texts.Any(t => t.Text.Contains("Lion King"))); + // Assert.AreEqual("{ value: \"Lion King Metadata\" }", lionResults.Texts.First().Metadata); + + // // Verify that the Chinese texts were added. + // var daNaoResults = vdb.Search("部经典"); + // Assert.IsTrue(daNaoResults.Texts.Any(t => t.Text.Contains("部经典"))); + // Assert.AreEqual("{ value: \"元数据新增2\" }", daNaoResults.Texts.First().Metadata); + + // var baiJieResults = vdb.Search("霸王别姬"); + // Assert.IsTrue(baiJieResults.Texts.Any(t => t.Text.Contains("霸王别姬"))); + // Assert.AreEqual("{ value: \"元数据新增2\" }", baiJieResults.Texts.First().Metadata); + // } [TestMethod] public void Text_Metadata_String_01() From 1b84a65479d15806ba3bda5a0b473854a1bbcb28 Mon Sep 17 00:00:00 2001 From: Chris Pietschmann Date: Sat, 22 Feb 2025 19:12:30 -0500 Subject: [PATCH 30/33] Found a fix for Chinese language/character support (#8) --- CHANGELOG.md | 3 +- .../Preprocessing/BasicTextPreprocessor.cs | 21 +++- src/SharpVectorTest/VectorDatabaseTests.cs | 114 +++++++++--------- 3 files changed, 79 insertions(+), 59 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8a5be6d..86367c1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,7 +9,8 @@ and this project adheres to [Semantic 
Versioning](https://semver.org/spec/v2.0.0 Added: -- Add data persistence capability +- Add data persistence capability to save/load from a file or to/from a `Stream` +- Add Chinese language/character support Breaking Change: diff --git a/src/Build5Nines.SharpVector/Preprocessing/BasicTextPreprocessor.cs b/src/Build5Nines.SharpVector/Preprocessing/BasicTextPreprocessor.cs index 3b878a9..ba94a5e 100644 --- a/src/Build5Nines.SharpVector/Preprocessing/BasicTextPreprocessor.cs +++ b/src/Build5Nines.SharpVector/Preprocessing/BasicTextPreprocessor.cs @@ -8,9 +8,24 @@ public class BasicTextPreprocessor : ITextPreprocessor public IEnumerable TokenizeAndPreprocess(string text) { text = text.ToLower(); - text = Regex.Replace(text, @"[^\w\s]", ""); - text = Regex.Replace(text, @"\s+", " ").Trim(); - return text.Split(' ').ToList(); + + // Check if text contains Chinese characters using the CJK Unified Ideographs block + if (Regex.IsMatch(text, @"\p{IsCJKUnifiedIdeographs}")) + { + // Remove punctuation (excluding Chinese characters) + text = Regex.Replace(text, @"[^\p{IsCJKUnifiedIdeographs}\w\s]", ""); + // Tokenize either by matching individual Chinese characters or contiguous word tokens (for Latin letters/digits) + var tokens = Regex.Matches(text, @"[\p{IsCJKUnifiedIdeographs}]|[a-z0-9]+") + .Cast() + .Select(m => m.Value); + return tokens.ToList(); + } + else + { + text = Regex.Replace(text, @"[^\w\s]", ""); + text = Regex.Replace(text, @"\s+", " ").Trim(); + return text.Split(' ').ToList(); + } } public async Task> TokenizeAndPreprocessAsync(string text) diff --git a/src/SharpVectorTest/VectorDatabaseTests.cs b/src/SharpVectorTest/VectorDatabaseTests.cs index 58a943b..58bfa51 100644 --- a/src/SharpVectorTest/VectorDatabaseTests.cs +++ b/src/SharpVectorTest/VectorDatabaseTests.cs @@ -304,61 +304,65 @@ public void Text_Update_01() Assert.AreEqual("{ value: \"JSON Metadata Value\" }", results.Texts.First().Metadata); } - // [TestMethod] - // public void 
Text_Update_01_Chinese() - // { - // var vdb = new MemoryVectorDatabase(); - - // // Load Vector Database with Chinese sample text and JSON metadata - // var id = vdb.AddText("狮子王是一部1994年的迪士尼动画电影,讲述一个小狮子辛巴必将继承非洲大草原王位的故事。", "{ value: \"元数据初始值\" }"); - - // // Verify that search returns the expected text - // var results = vdb.Search("狮子"); - // Assert.AreEqual(1, results.Texts.Count()); - // Assert.IsTrue(results.Texts.First().Text.Contains("狮子王")); - - // // Update the text - // vdb.UpdateText(id, "狮子王是一部非常棒的电影!"); - - // // Verify that the text is updated but the metadata remains unchanged - // results = vdb.Search("狮子"); - // Assert.AreEqual("狮子王是一部非常棒的电影!", results.Texts.First().Text); - // Assert.AreEqual("{ value: \"元数据初始值\" }", results.Texts.First().Metadata); - // } - - // [TestMethod] - // public void Text_Update_01_English_and_Chinese() - // { - // var vdb = new MemoryVectorDatabase(); - - // // Load the Vector Database with some initial sample texts. - // vdb.AddText("The Lion King is a 1994 Disney animated film about a young lion cub named Simba who is the heir to the throne of an African savanna.", "{ value: \"Lion King Metadata\" }"); - // vdb.AddText("Aladdin is a 2019 live-action Disney adaptation of the 1992 animated classic about a street urchin who finds a magic lamp.", "{ value: \"Aladdin Metadata\" }"); - // vdb.AddText("The Little Mermaid is a 2023 live-action adaptation of Disney's classic animated film about Ariel.", "{ value: \"Little Mermaid Metadata\" }"); - - // // Add additional texts to the database. - // vdb.AddText("Mulan is an epic tale of bravery and honor in ancient China.", "{ value: \"Mulan Metadata\" }"); - // vdb.AddText("Crouching Tiger, Hidden Dragon is a martial arts masterpiece with breathtaking scenes.", "{ value: \"Crouching Metadata\" }"); - // vdb.AddText("In the Mood for Love is a visually stunning film about forbidden romance.", "{ value: \"In the Mood Metadata\" }"); - - // // Add more Chinese texts. 
- // vdb.AddText("大闹天宫是一部经典 的中国动画电影,讲述孙悟空大闹天宫的故事。", "{ value: \"元数据新增1\" }"); - // vdb.AddText("霸王别姬是一部关于 爱与背叛的中国史诗电影。", "{ value: \"元数据新增2\" }"); - - // // Verify that a search for "Lion King" returns the expected result. - // var lionResults = vdb.Search("Lion King"); - // Assert.IsTrue(lionResults.Texts.Any(t => t.Text.Contains("Lion King"))); - // Assert.AreEqual("{ value: \"Lion King Metadata\" }", lionResults.Texts.First().Metadata); - - // // Verify that the Chinese texts were added. - // var daNaoResults = vdb.Search("部经典"); - // Assert.IsTrue(daNaoResults.Texts.Any(t => t.Text.Contains("部经典"))); - // Assert.AreEqual("{ value: \"元数据新增2\" }", daNaoResults.Texts.First().Metadata); - - // var baiJieResults = vdb.Search("霸王别姬"); - // Assert.IsTrue(baiJieResults.Texts.Any(t => t.Text.Contains("霸王别姬"))); - // Assert.AreEqual("{ value: \"元数据新增2\" }", baiJieResults.Texts.First().Metadata); - // } + [TestMethod] + public void Text_Update_01_Chinese() + { + var vdb = new MemoryVectorDatabase(); + + // Load Vector Database with Chinese sample text and JSON metadata + var id = vdb.AddText("狮子王是一部1994年的迪士尼动画电影,讲述一个小狮子辛巴必将继承非洲大草原王位的故事。", "{ value: \"元数据初始值\" }"); + + // Verify that search returns the expected text + var results = vdb.Search("狮子"); + Assert.AreEqual(1, results.Texts.Count()); + Assert.IsTrue(results.Texts.First().Text.Contains("狮子王")); + + // Update the text + vdb.UpdateText(id, "狮子王是一部非常棒的电影!"); + + // Verify that the text is updated but the metadata remains unchanged + results = vdb.Search("狮子"); + Assert.AreEqual("狮子王是一部非常棒的电影!", results.Texts.First().Text); + Assert.AreEqual("{ value: \"元数据初始值\" }", results.Texts.First().Metadata); + } + + [TestMethod] + public void Text_Update_01_English_and_Chinese() + { + var vdb = new MemoryVectorDatabase(); + + // Load the Vector Database with some initial sample texts. 
+ vdb.AddText("The Lion King is a 1994 Disney animated film about a young lion cub named Simba who is the heir to the throne of an African savanna.", "{ value: \"Lion King Metadata\" }"); + vdb.AddText("Aladdin is a 2019 live-action Disney adaptation of the 1992 animated classic about a street urchin who finds a magic lamp.", "{ value: \"Aladdin Metadata\" }"); + vdb.AddText("The Little Mermaid is a 2023 live-action adaptation of Disney's classic animated film about Ariel.", "{ value: \"Little Mermaid Metadata\" }"); + + // Add additional texts to the database. + vdb.AddText("Mulan is an epic tale of bravery and honor in ancient China.", "{ value: \"Mulan Metadata\" }"); + vdb.AddText("Crouching Tiger, Hidden Dragon is a martial arts masterpiece with breathtaking scenes.", "{ value: \"Crouching Metadata\" }"); + vdb.AddText("In the Mood for Love is a visually stunning film about forbidden romance.", "{ value: \"In the Mood Metadata\" }"); + + // Add more Chinese texts. + vdb.AddText("大闹天宫是一部经典 的中国动画电影,讲述孙悟空大闹天宫的故事。", "{ value: \"元数据新增1\" }"); + vdb.AddText("霸王别姬是一部关于 爱与背叛的中国史诗电影。", "{ value: \"元数据新增2\" }"); + + // Verify that a search for "Lion King" returns the expected result. + var lionResults = vdb.Search("Lion King"); + Assert.IsTrue(lionResults.Texts.Any(t => t.Text.Contains("Lion King"))); + Assert.AreEqual("{ value: \"Lion King Metadata\" }", lionResults.Texts.First().Metadata); + + // Verify that the Chinese texts were added. 
+ var daNaoResults = vdb.Search("部经典"); + Assert.IsTrue(daNaoResults.Texts.Any(t => t.Text.Contains("部经典"))); + Assert.AreEqual("{ value: \"元数据新增1\" }", daNaoResults.Texts.First().Metadata); + + var baiJieResults = vdb.Search("霸王别姬"); + Assert.IsTrue(baiJieResults.Texts.Any(t => t.Text.Contains("霸王别姬"))); + Assert.AreEqual("{ value: \"元数据新增2\" }", baiJieResults.Texts.First().Metadata); + + var baiJieResults2 = vdb.Search("宫故事"); + Assert.IsTrue(baiJieResults2.Texts.Any(t => t.Text.Contains("霸王别姬"))); + Assert.AreEqual("{ value: \"元数据新增1\" }", baiJieResults2.Texts.First().Metadata); + } [TestMethod] public void Text_Metadata_String_01() From ff0c1c798df2ec4fda542eefee2dc3d1595026ff Mon Sep 17 00:00:00 2001 From: Chris Pietschmann Date: Sun, 23 Feb 2025 08:49:17 -0500 Subject: [PATCH 31/33] update references for SharpVector 2.0.0 --- samples/genai-rag-onnx/genai-rag-onnx.csproj | 10 +- .../Build5Nines.SharpVector.OpenAI.csproj | 2 +- .../OpenAIMemoryVectorDatabaseBase.cs | 133 ++++++++++++++++++ .../OpenAIConsoleTest.csproj | 12 +- .../SharpVectorOpenAITest.csproj | 6 +- 5 files changed, 148 insertions(+), 15 deletions(-) diff --git a/samples/genai-rag-onnx/genai-rag-onnx.csproj b/samples/genai-rag-onnx/genai-rag-onnx.csproj index 20adb58..e07aa3c 100644 --- a/samples/genai-rag-onnx/genai-rag-onnx.csproj +++ b/samples/genai-rag-onnx/genai-rag-onnx.csproj @@ -8,11 +8,11 @@ enable - - - - - + + + + + diff --git a/src/Build5Nines.SharpVector.OpenAI/Build5Nines.SharpVector.OpenAI.csproj b/src/Build5Nines.SharpVector.OpenAI/Build5Nines.SharpVector.OpenAI.csproj index 588c6a5..9579dcb 100644 --- a/src/Build5Nines.SharpVector.OpenAI/Build5Nines.SharpVector.OpenAI.csproj +++ b/src/Build5Nines.SharpVector.OpenAI/Build5Nines.SharpVector.OpenAI.csproj @@ -24,7 +24,7 @@ - + diff --git a/src/Build5Nines.SharpVector.OpenAI/OpenAIMemoryVectorDatabaseBase.cs b/src/Build5Nines.SharpVector.OpenAI/OpenAIMemoryVectorDatabaseBase.cs index 735c0bb..1517a11 100644 --- 
a/src/Build5Nines.SharpVector.OpenAI/OpenAIMemoryVectorDatabaseBase.cs +++ b/src/Build5Nines.SharpVector.OpenAI/OpenAIMemoryVectorDatabaseBase.cs @@ -248,4 +248,137 @@ private async Task>> CalculateVector } return results; } + + /// + /// Serializes the Vector Database to a JSON stream + /// + /// + /// + /// + public virtual async Task SerializeToJsonStreamAsync(Stream stream) + { + var streamVectorStore = new MemoryStream(); + var streamVocabularyStore = new MemoryStream(); + + var taskVectorStore = VectorStore.SerializeToJsonStreamAsync(streamVectorStore); + var taskVocabularyStore = VectorStore.VocabularyStore.SerializeToJsonStreamAsync(streamVocabularyStore); + + await Task.WhenAll(taskVectorStore, taskVocabularyStore); + + using (var archive = new ZipArchive(stream, ZipArchiveMode.Create, true)) + { + var entryDatabaseType = archive.CreateEntry("database.json"); + using (var entryStream = entryDatabaseType.Open()) + { + var databaseInfo = new DatabaseInfo(this.GetType().FullName); + + var databaseInfoJson = JsonSerializer.Serialize(databaseInfo); + + if (databaseInfoJson != null) + { + var databaseTypeBytes = System.Text.Encoding.UTF8.GetBytes(databaseInfoJson); + await entryStream.WriteAsync(databaseTypeBytes); + await entryStream.FlushAsync(); + } + else + { + throw new InvalidOperationException("Type name cannot be null."); + } + } + var entryVectorStore = archive.CreateEntry("vectorstore.json"); + using (var entryStream = entryVectorStore.Open()) + { + streamVectorStore.Position = 0; + await streamVectorStore.CopyToAsync(entryStream); + await entryStream.FlushAsync(); + } + } + + await stream.FlushAsync(); + } + + public virtual void SerializeToJsonStream(Stream stream) + { + if (stream == null) + { + throw new ArgumentNullException(nameof(stream)); + } + SerializeToJsonStreamAsync(stream).Wait(); + } + + public virtual async Task DeserializeFromJsonStreamAsync(Stream stream) + { + if (stream == null) + { + throw new ArgumentNullException(nameof(stream)); 
+ } + + using (var archive = new ZipArchive(stream, ZipArchiveMode.Read)) + { + var entryDatabaseType = archive.GetEntry("database.json"); + if (entryDatabaseType != null) + { + using (var entryStream = entryDatabaseType.Open()) + { + var databaseTypeStream = new MemoryStream(); + await entryStream.CopyToAsync(databaseTypeStream); + databaseTypeStream.Position = 0; + + var databaseTypeBytes = new byte[databaseTypeStream.Length]; + await databaseTypeStream.ReadAsync(databaseTypeBytes); + var databaseInfoJson = System.Text.Encoding.UTF8.GetString(databaseTypeBytes); + + var databaseInfo = JsonSerializer.Deserialize(databaseInfoJson); + + if (databaseInfo == null) + { + throw new DatabaseFileInfoException("Database info entry is null."); + } + + if (databaseInfo.Schema != DatabaseInfo.SupportedSchema) + { + throw new DatabaseFileSchemaException($"The database schema does not match the expected schema (Expected: {DatabaseInfo.SupportedSchema} - Actual: {databaseInfo.Schema})."); + } + + if (databaseInfo.Version != DatabaseInfo.SupportedVersion) + { + throw new DatabaseFileVersionException($"The database version does not match the expected version (Expected: {DatabaseInfo.SupportedVersion} - Actual: {databaseInfo.Version})."); + } + + if (databaseInfo.ClassType != this.GetType().FullName) + { + throw new DatabaseFileClassTypeException($"The database class type does not match the expected type (Expected: {this.GetType().FullName} - Actual: {databaseInfo.ClassType})"); + } + } + } + else + { + throw new DatabaseFileMissingEntryException("Database info entry not found.", "database"); + } + + + var entryVectorStore = archive.GetEntry("vectorstore.json"); + if (entryVectorStore != null) + { + using (var entryStream = entryVectorStore.Open()) + { + await VectorStore.DeserializeFromJsonStreamAsync(entryStream); + } + } + else + { + throw new DatabaseFileMissingEntryException("Vector Store entry not found.", "vectorstore"); + } + } + } + + public virtual void 
DeserializeFromJsonStream(Stream stream) + { + if (stream == null) + { + throw new ArgumentNullException(nameof(stream)); + } + DeserializeFromJsonStreamAsync(stream).Wait(); + } + } \ No newline at end of file diff --git a/src/OpenAIConsoleTest/OpenAIConsoleTest.csproj b/src/OpenAIConsoleTest/OpenAIConsoleTest.csproj index 7763900..ec2c116 100644 --- a/src/OpenAIConsoleTest/OpenAIConsoleTest.csproj +++ b/src/OpenAIConsoleTest/OpenAIConsoleTest.csproj @@ -7,14 +7,14 @@ enable - - - - + + + + - - + + diff --git a/src/SharpVectorOpenAITest/SharpVectorOpenAITest.csproj b/src/SharpVectorOpenAITest/SharpVectorOpenAITest.csproj index bf8dd58..f983d11 100644 --- a/src/SharpVectorOpenAITest/SharpVectorOpenAITest.csproj +++ b/src/SharpVectorOpenAITest/SharpVectorOpenAITest.csproj @@ -10,7 +10,7 @@ - + @@ -23,8 +23,8 @@ - - + + From 21ebf9decb8868a87226de136ab0b01ea8de619d Mon Sep 17 00:00:00 2001 From: Chris Pietschmann Date: Sun, 23 Feb 2025 09:06:14 -0500 Subject: [PATCH 32/33] Update OnnxRuntime references (think I have it working) --- samples/genai-rag-onnx/Program.cs | 5 +++-- samples/genai-rag-onnx/genai-rag-onnx.csproj | 6 +++--- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/samples/genai-rag-onnx/Program.cs b/samples/genai-rag-onnx/Program.cs index 0933bc0..5d3471f 100644 --- a/samples/genai-rag-onnx/Program.cs +++ b/samples/genai-rag-onnx/Program.cs @@ -162,11 +162,12 @@ static async Task Main(string[] args) var generatorParams = new GeneratorParams(model); generatorParams.SetSearchOption("max_length", maxPromptLength); generatorParams.SetSearchOption("past_present_share_buffer", false); - generatorParams.SetInputSequences(tokens); + //generatorParams.SetInputSequences(tokens); // Generate the response Console.WriteLine("AI is thinking..."); var generator = new Generator(model, generatorParams); + generator.AppendTokenSequences(tokens); // show in console that the assistant is responding Console.WriteLine(""); @@ -174,7 +175,7 @@ static async Task 
Main(string[] args) // Output response as each token in generated while (!generator.IsDone()) { - generator.ComputeLogits(); + //generator.ComputeLogits(); generator.GenerateNextToken(); var output = GetOutputTokens(generator, tokenizer); Console.Write(output); diff --git a/samples/genai-rag-onnx/genai-rag-onnx.csproj b/samples/genai-rag-onnx/genai-rag-onnx.csproj index e07aa3c..8bdbc09 100644 --- a/samples/genai-rag-onnx/genai-rag-onnx.csproj +++ b/samples/genai-rag-onnx/genai-rag-onnx.csproj @@ -10,9 +10,9 @@ - - - + + + From dbc8776b080d9db0327bff90a937c3812d2d03fb Mon Sep 17 00:00:00 2001 From: Chris Pietschmann Date: Sun, 23 Feb 2025 09:21:53 -0500 Subject: [PATCH 33/33] Update SharpVector.OpenAI to 2.0.0 with save/load functionality support --- CHANGELOG.md | 4 +-- .../OpenAIMemoryVectorDatabaseBase.cs | 28 ++++++++++--------- src/OpenAIConsoleTest/Program.cs | 5 +--- .../BasicOpenAIMemoryVectorDatabaseTest.cs | 9 ++++++ 4 files changed, 27 insertions(+), 19 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 86367c1..5cb7c2a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,11 +5,11 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 
-## v2.0.0 (In Progress) +## v2.0.0 Added: -- Add data persistence capability to save/load from a file or to/from a `Stream` +- Add data persistence capability to save/load from a file or to/from a `Stream` (Both SharpVector and SharpVector.OpenAI) - Add Chinese language/character support Breaking Change: diff --git a/src/Build5Nines.SharpVector.OpenAI/OpenAIMemoryVectorDatabaseBase.cs b/src/Build5Nines.SharpVector.OpenAI/OpenAIMemoryVectorDatabaseBase.cs index 1517a11..3c59866 100644 --- a/src/Build5Nines.SharpVector.OpenAI/OpenAIMemoryVectorDatabaseBase.cs +++ b/src/Build5Nines.SharpVector.OpenAI/OpenAIMemoryVectorDatabaseBase.cs @@ -3,13 +3,15 @@ using Build5Nines.SharpVector.VectorStore; using System.Collections.Concurrent; using OpenAI.Embeddings; +using System.IO.Compression; +using System.Text.Json; namespace Build5Nines.SharpVector.OpenAI; public abstract class OpenAIMemoryVectorDatabaseBase : IVectorDatabase where TId : notnull - where TVectorStore : IVectorStore + where TVectorStore : IVectorStore where TIdGenerator : IIdGenerator, new() where TVectorComparer : IVectorComparer, new() { @@ -76,7 +78,7 @@ public IEnumerable GetIds() /// /// /// - public IVectorTextItem GetText(TId id) + public IVectorTextItem GetText(TId id) { return VectorStore.Get(id); } @@ -86,7 +88,7 @@ public IVectorTextItem GetText(TId id) /// /// /// - public IVectorTextItem DeleteText(TId id) + public IVectorTextItem DeleteText(TId id) { return VectorStore.Delete(id); } @@ -193,7 +195,7 @@ public void UpdateTextAndMetadata(TId id, string text, TMetadata metadata) /// The highest number of results to show. /// The similarity threshold. Only return items greater or equal to the threshold. Null returns all. /// - public IVectorTextResult Search(string queryText, float? threshold = null, int pageIndex = 0, int? pageCount = null) + public IVectorTextResult Search(string queryText, float? threshold = null, int pageIndex = 0, int? 
pageCount = null) { return SearchAsync(queryText, threshold, pageIndex, pageCount).Result; } @@ -206,7 +208,7 @@ public IVectorTextResult Search(string queryText, float? threshold = /// The page index of the search results. Default is 0. /// The number of search results per page. Default is Null and returns all results. /// - public async Task> SearchAsync(string queryText, float? threshold = null, int pageIndex = 0, int? pageCount = null) + public async Task> SearchAsync(string queryText, float? threshold = null, int pageIndex = 0, int? pageCount = null) { var similarities = await CalculateVectorComparisonAsync(queryText, threshold); @@ -260,10 +262,7 @@ public virtual async Task SerializeToJsonStreamAsync(Stream stream) var streamVectorStore = new MemoryStream(); var streamVocabularyStore = new MemoryStream(); - var taskVectorStore = VectorStore.SerializeToJsonStreamAsync(streamVectorStore); - var taskVocabularyStore = VectorStore.VocabularyStore.SerializeToJsonStreamAsync(streamVocabularyStore); - - await Task.WhenAll(taskVectorStore, taskVocabularyStore); + await VectorStore.SerializeToJsonStreamAsync(streamVectorStore); using (var archive = new ZipArchive(stream, ZipArchiveMode.Create, true)) { @@ -330,19 +329,22 @@ public virtual async Task DeserializeFromJsonStreamAsync(Stream stream) var databaseInfo = JsonSerializer.Deserialize(databaseInfoJson); + string SupportedVersion = "1.0.0"; + string SupportedSchema = "Build5Nines.SharpVector"; + if (databaseInfo == null) { throw new DatabaseFileInfoException("Database info entry is null."); } - if (databaseInfo.Schema != DatabaseInfo.SupportedSchema) + if (databaseInfo.Schema != SupportedSchema) { - throw new DatabaseFileSchemaException($"The database schema does not match the expected schema (Expected: {DatabaseInfo.SupportedSchema} - Actual: {databaseInfo.Schema})."); + throw new DatabaseFileSchemaException($"The database schema does not match the expected schema (Expected: {SupportedSchema} - Actual: 
{databaseInfo.Schema})."); } - if (databaseInfo.Version != DatabaseInfo.SupportedVersion) + if (databaseInfo.Version != SupportedVersion) { - throw new DatabaseFileVersionException($"The database version does not match the expected version (Expected: {DatabaseInfo.SupportedVersion} - Actual: {databaseInfo.Version})."); + throw new DatabaseFileVersionException($"The database version does not match the expected version (Expected: {SupportedVersion} - Actual: {databaseInfo.Version})."); } if (databaseInfo.ClassType != this.GetType().FullName) diff --git a/src/OpenAIConsoleTest/Program.cs b/src/OpenAIConsoleTest/Program.cs index 71005af..36d034a 100644 --- a/src/OpenAIConsoleTest/Program.cs +++ b/src/OpenAIConsoleTest/Program.cs @@ -77,15 +77,12 @@ await Parallel.ForEachAsync(movies.EnumerateArray(), async (movie, cancellationT Console.WriteLine(string.Empty); if (newPrompt != null) { - IVectorTextResult result; - var timer = new Stopwatch(); timer.Start(); - var pageSize = 3; // result = await vdb.Search(newPrompt, - result = await vdb.SearchAsync(newPrompt, + var result = await vdb.SearchAsync(newPrompt, threshold: 0.001f, // 0.2f, // Cosine Similarity - Only return results with similarity greater than this threshold // threshold: (float)1.4f, // Euclidean Distance - Only return results with distance less than this threshold diff --git a/src/SharpVectorOpenAITest/BasicOpenAIMemoryVectorDatabaseTest.cs b/src/SharpVectorOpenAITest/BasicOpenAIMemoryVectorDatabaseTest.cs index 6779c78..392d98d 100644 --- a/src/SharpVectorOpenAITest/BasicOpenAIMemoryVectorDatabaseTest.cs +++ b/src/SharpVectorOpenAITest/BasicOpenAIMemoryVectorDatabaseTest.cs @@ -29,5 +29,14 @@ public void TestInitialization() Assert.IsNotNull(_database); } + [TestMethod] + public async Task Test_SaveLoad_01() + { + var filename = "openai_test_saveload_01.b59vdb"; + await _database.SaveToFileAsync(filename); + + await _database.LoadFromFileAsync(filename); + } + } } \ No newline at end of file