Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 12 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,18 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## v2.0.0
## v2.0.1 (2025-03-17)

Added:

- Exposed the internal vector array of `VectorTextItem` through the new `VectorTextResultItem.Vectors` property, so that consuming code can access the vectors when it needs them. This mainly allows more flexible usage of the library.
- Added Overlapping Window text chunking (`TextChunkingMethod.OverlappingWindow`) to `TextDataLoader` for enhanced document segmentation with overlapping content, improving metadata extraction and search result relevance.

Fixed:

- When using `Data.TextDataLoader` with `TextChunkingMethod.FixedLength`, the text was split on the space character, which did not work correctly for Chinese text. Chunking now handles Chinese characters correctly as well.

## v2.0.0 (2025-02-23)

Added:

Expand Down
2 changes: 1 addition & 1 deletion src/Build5Nines.SharpVector/Build5Nines.SharpVector.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
<PackageId>Build5Nines.SharpVector</PackageId>
<PackageProjectUrl>https://github.com/Build5Nines/SharpVector</PackageProjectUrl>
<RepositoryUrl>https://github.com/Build5Nines/SharpVector</RepositoryUrl>
<Version>2.0.0</Version>
<Version>2.0.1</Version>
<Description>Lightweight In-memory Vector Database to embed in any .NET Applications</Description>
<Copyright>Copyright (c) 2025 Build5Nines LLC</Copyright>
<PackageReadmeFile>README.md</PackageReadmeFile>
Expand Down
6 changes: 5 additions & 1 deletion src/Build5Nines.SharpVector/Data/TextChunkingMethod.cs
Original file line number Diff line number Diff line change
Expand Up @@ -13,5 +13,9 @@ public enum TextChunkingMethod
/// <summary>
/// Split the text into fixed length chunks
/// </summary>
FixedLength
FixedLength,
/// <summary>
/// Split the text into overlapping windows
/// </summary>
OverlappingWindow
}
12 changes: 9 additions & 3 deletions src/Build5Nines.SharpVector/Data/TextChunkingOptions.cs
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ public TextChunkingOptions()
#pragma warning disable CS8603 // Possible null reference return.
RetrieveMetadata = (chunk) => default;
#pragma warning restore CS8603 // Possible null reference return.
OverlapSize = 50;
}

/// <summary>
Expand All @@ -17,13 +18,18 @@ public TextChunkingOptions()
public TextChunkingMethod Method { get; set; }

/// <summary>
/// The size of each chunk of text. Default is 100.
/// Used only for FixedLength method
/// The length in tokens (aka "words") of each chunk of text. Default is 100.
/// Only used by TextChunkingMethod.FixedLength and TextChunkingMethod.OverlappingWindow.
/// </summary>
public int ChunkSize { get; set; }
public int ChunkSize { get; set; }

/// <summary>
/// Lambda function to retrieve custom metadata for each chunk
/// </summary>
public Func<string, TMetadata> RetrieveMetadata { get; set; }

/// <summary>
/// The number of words to overlap text chunks when using TextChunkingMethod.OverlappingWindow. Default is 50.
/// </summary>
public int OverlapSize { get; set; }
}
60 changes: 57 additions & 3 deletions src/Build5Nines.SharpVector/Data/TextDataLoader.cs
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ namespace Build5Nines.SharpVector.Data;

using System.ComponentModel.DataAnnotations;
using System.Text.RegularExpressions;
using Build5Nines.SharpVector.Preprocessing;

public class TextDataLoader<TId, TMetadata>
where TId : notnull
Expand All @@ -12,6 +13,8 @@ public TextDataLoader(IVectorDatabase<TId, TMetadata> vectorDatabase)
VectorDatabase = vectorDatabase;
}

const string _space = " ";

public IVectorDatabase<TId, TMetadata> VectorDatabase { get; private set; }

public IEnumerable<TId> AddDocument(string document, TextChunkingOptions<TMetadata> chunkingOptions)
Expand Down Expand Up @@ -41,6 +44,8 @@ protected List<string> ChunkText(string text, TextChunkingOptions<TMetadata> chu
return SplitIntoSentences(text);
case TextChunkingMethod.FixedLength:
return SplitIntoChunks(text, chunkingOptions.ChunkSize);
case TextChunkingMethod.OverlappingWindow:
return SplitIntoOverlappingWindows(text, chunkingOptions.ChunkSize, chunkingOptions.OverlapSize);
default:
throw new ArgumentException("Invalid chunking method");
}
Expand All @@ -58,18 +63,67 @@ protected static List<string> SplitIntoSentences(string text)

/// <summary>
/// Splits text into consecutive, non-overlapping chunks of at most
/// <paramref name="chunkSize"/> tokens each.
/// </summary>
/// <param name="text">The text to tokenize and chunk.</param>
/// <param name="chunkSize">Maximum number of tokens per chunk; must be greater than zero.</param>
/// <returns>The chunks in document order; the last chunk may hold fewer tokens.</returns>
/// <exception cref="ArgumentOutOfRangeException">
/// Thrown when <paramref name="chunkSize"/> is zero or negative.
/// </exception>
protected static List<string> SplitIntoChunks(string text, int chunkSize)
{
    // A step of zero never advances the loop below and a negative step walks the index
    // backwards, so either would keep appending empty chunks until memory runs out.
    if (chunkSize <= 0)
    {
        throw new ArgumentOutOfRangeException(nameof(chunkSize), chunkSize, "Chunk size must be greater than zero");
    }

    // Tokenize instead of splitting on ' ' so text without spaces between words is still chunked.
    var words = SplitIntoTokens(text);
    var chunks = new List<string>();

    for (int i = 0; i < words.Length; i += chunkSize)
    {
        chunks.Add(JoinTokens(words.Skip(i).Take(chunkSize)));
    }

    return chunks;
}

/// <summary>
/// Splits text into windows of at most <paramref name="chunkSize"/> tokens, where each
/// window starts <c>chunkSize - overlap</c> tokens after the previous one so that
/// consecutive windows share <paramref name="overlap"/> tokens.
/// </summary>
/// <param name="text">The text to tokenize and chunk.</param>
/// <param name="chunkSize">Maximum number of tokens per window; must be greater than zero.</param>
/// <param name="overlap">
/// Number of tokens shared by consecutive windows; must be zero or more and smaller than
/// <paramref name="chunkSize"/>.
/// </param>
/// <returns>The non-blank windows in document order.</returns>
/// <exception cref="ArgumentOutOfRangeException">
/// Thrown when <paramref name="chunkSize"/> is not positive or <paramref name="overlap"/> is negative.
/// </exception>
/// <exception cref="ArgumentException">
/// Thrown when <paramref name="overlap"/> is not smaller than <paramref name="chunkSize"/>.
/// </exception>
protected static List<string> SplitIntoOverlappingWindows(string text, int chunkSize, int overlap)
{
    // Validate before tokenizing so invalid options fail fast without doing any work.
    if (chunkSize <= 0)
    {
        throw new ArgumentOutOfRangeException(nameof(chunkSize), chunkSize, "Chunk size must be greater than zero");
    }

    // A negative overlap would make the step larger than the window and silently
    // drop the tokens that fall between two windows.
    if (overlap < 0)
    {
        throw new ArgumentOutOfRangeException(nameof(overlap), overlap, "Overlap must not be negative");
    }

    if (overlap >= chunkSize)
    {
        throw new ArgumentException("Overlap must be smaller than chunk size");
    }

    var tokens = SplitIntoTokens(text);
    var chunks = new List<string>();

    // Distance between the start of one window and the start of the next.
    int step = chunkSize - overlap;
    int tokenLength = tokens.Length;
    for (int i = 0; i < tokenLength; i += step)
    {
        var chunk = JoinTokens(tokens.Skip(i).Take(chunkSize));
        if (!string.IsNullOrWhiteSpace(chunk))
        {
            chunks.Add(chunk);
        }

        // This window already reached the last token; another one would only
        // repeat a tail that has been emitted.
        if (i + chunkSize >= tokenLength)
        {
            break;
        }
    }

    return chunks;
}

/// <summary>
/// Rebuilds a single string from tokens. A token for which <c>IsChinese</c> returns true is
/// appended as-is; every other token is appended with one space in front of it. The result
/// is trimmed of leading and trailing whitespace.
/// </summary>
/// <param name="tokens">The tokens to join; <c>null</c> yields an empty string.</param>
/// <returns>The joined and trimmed text.</returns>
private static string JoinTokens(IEnumerable<string> tokens)
{
    if (tokens is null)
    {
        return string.Empty;
    }

    var builder = new System.Text.StringBuilder();
    foreach (var current in tokens)
    {
        // Only non-Chinese tokens get a separating space in front of them.
        var piece = IsChinese(current) ? current : _space + current;
        builder.Append(piece);
    }

    var joined = builder.ToString();
    return joined.Trim();
}

/// <summary>
/// Tests a token against a pattern that accepts only characters from the
/// CJK Unified Ideographs block.
/// </summary>
/// <param name="token">The token to test.</param>
/// <returns><c>true</c> when the token matches the pattern; otherwise <c>false</c>.</returns>
private static bool IsChinese(string token)
{
    // One or more CJK Unified Ideograph characters, anchored at both ends.
    const string cjkOnlyPattern = @"^\p{IsCJKUnifiedIdeographs}+$";

    var match = System.Text.RegularExpressions.Regex.Match(token, cjkOnlyPattern);
    return match.Success;
}

/// <summary>
/// Tokenizes text by running it through a newly created <c>BasicTextPreprocessor</c>
/// and materializing the resulting tokens into an array.
/// </summary>
/// <param name="text">The text to tokenize.</param>
/// <returns>The tokens produced by <c>TokenizeAndPreprocess</c>, as an array.</returns>
protected static string[] SplitIntoTokens(string text)
{
    return new BasicTextPreprocessor()
        .TokenizeAndPreprocess(text)
        .ToArray();
}

public async Task<IEnumerable<TId>> AddDocumentAsync(string document, TextChunkingOptions<TMetadata> chunkingOptions)
{
if (chunkingOptions.RetrieveMetadata == null)
Expand Down
5 changes: 5 additions & 0 deletions src/Build5Nines.SharpVector/VectorTextResultItem.cs
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
using System.Collections.Immutable;

namespace Build5Nines.SharpVector;

public interface IVectorTextResultItem<TDocument, TMetadata>
Expand All @@ -23,6 +25,9 @@ public VectorTextResultItem(IVectorTextItem<TDocument, TMetadata> item, float ve

/// <summary>The text of the wrapped item.</summary>
public TDocument Text => _item.Text;

/// <summary>The metadata of the wrapped item.</summary>
public TMetadata? Metadata => _item.Metadata;

/// <summary>
/// The wrapped item's vector, exposed through <see cref="ImmutableArray.Create{T}(T[])"/>.
/// NOTE(review): presumably <c>_item.Vector</c> is a <c>float[]</c>, in which case a new
/// immutable copy is built on every access - confirm against the item type.
/// </summary>
public ImmutableArray<float> Vectors => ImmutableArray.Create(_item.Vector);

/// <summary>The comparison value for this result; it can only be set from inside this class.</summary>
public float VectorComparison { get; private set; }
}

Expand Down
49 changes: 49 additions & 0 deletions src/SharpVectorTest/Data/TextDataLoaderTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -48,4 +48,53 @@ public void TextDataLoader_Paragraphs_01()
Assert.AreEqual("{ chuckSize: \"133\" }", results.Texts.First().Metadata);
Assert.AreEqual(0.3396831452846527, results.Texts.First().VectorComparison);
}

/// <summary>
/// Loads a multi-paragraph document using TextChunkingMethod.OverlappingWindow
/// (ChunkSize 5, OverlapSize 2) and checks the number, order, text and metadata of the
/// chunks returned by a search for "Lion King".
/// </summary>
[TestMethod]
public void TextDataLoader_OverlappingWindow_01()
{
var vdb = new BasicMemoryVectorDatabase();

// Load the vector database with some sample text
var document = "The Lion King is a 1994 Disney animated film about a young lion cub named Simba who is the heir to the throne of an African savanna. \n\n" +
"Aladdin is a 2019 live-action Disney adaptation of the 1992 animated classic of the same name about a street urchin who finds a magic lamp and uses a genie's wishes to become a prince so he can marry Princess Jasmine. \n\n" +
"The Little Mermaid is a 2023 live-action adaptation of Disney's 1989 animated film of the same name. The movie is about Ariel, the youngest of King Triton's daughters, who is fascinated by the human world and falls in love with Prince Eric. \n\n" +
"Frozen is a 2013 Disney movie about a fearless optimist named Anna who sets off on a journey to find her sister Elsa, whose icy powers have trapped their kingdom in eternal winter. \n\n" +
"Tangled is a 2010 Disney animated comedy adventure film based on the story of Rapunzel. The movie is about a long-lost princess with magical blonde hair who has been locked in a tower her entire life by Gothel, who wants to use Rapunzel's powers for herself. \n\n" +
"Wreck-It Ralph is a 2012 Disney animated film about Ralph, a character who plays the bad guy in the arcade game Fix-It Felix Jr. for 30 years. Ralph is a muscular, 9-foot-tall character with spiky auburn hair, a pink nose, and large hands and feet. He wears burgundy overalls with a broken strap, a plaid shirt with ripped sleeves, and a teal undershirt. \n\n" +
"Iron Man (2008) is a Marvel Studios action, adventure, and sci-fi movie about Tony Stark (Robert Downey Jr.), a billionaire inventor and weapons developer who is kidnapped by terrorists and forced to build a weapon. Instead, Tony uses his ingenuity to build a high-tech suit of armor and escape, becoming the superhero Iron Man. He then returns to the United States to refine the suit and use it to fight crime and terrorism. \n\n" +
"Black Panther is a 2018 Marvel Studios movie about T'Challa, the heir to the isolated African nation of Wakanda, who returns home to take the throne after his father's death. However, T'Challa faces challenges from within his own country, including Killmonger, who wants to abandon Wakanda's isolationist policies and start a global revolution. T'Challa must team up with C.I.A. agent Everett K. Ross and the Dora Milaje, Wakanda's special forces, to prevent Wakanda from being drawn into a world war. \n\n" +
"Black Panther: Wakanda Forever is a 2022 Marvel movie about the Wakandans fighting to protect their country from world powers after the death of King T'Challa. The movie is a sequel to the popular Black Panther and stars Chadwick Boseman as T'Challa, Letitia Wright as Shuri, Angela Bassett as Ramonda, and Tenoch Huerta Mejía as Namor. \n\n" +
"The Incredible Hulk is a 2008 Marvel movie about scientist Bruce Banner (Edward Norton) who turns into a giant green monster called the Hulk when he's angry or frightened. After a gamma radiation accident, Banner is on the run from the military while searching for a cure for his condition. \n\n" +
"Hackers is a 1995 American crime thriller film about a group of high school hackers who discover a criminal plot to use a computer virus to destroy five oil tankers. The film stars Jonny Lee Miller, Angelina Jolie, Jesse Bradford, Matthew Lillard, Laurence Mason, Renoly Santiago, Lorraine Bracco, and Fisher Stevens. Iain Softley directed the film, which was made during the mid-1990s when the internet was becoming popular. \n\n" +
"WarGames is a 1983 American techno-thriller film about a high school computer hacker who accidentally accesses a top secret military supercomputer that controls the U.S. nuclear arsenal. The hacker, David Lightman (Matthew Broderick), starts a game of Global Thermonuclear War, triggering a false alarm that threatens to start World War III. David must convince the computer that he only wanted to play a game and not the real thing, with help from his girlfriend (Ally Sheedy) and a government official (Dabney Coleman) \n\n" +
"Cars is a 2006 Pixar movie about a rookie race car named Lightning McQueen who gets stranded in a small town while on his way to an important race. McQueen accidentally damages the road in Radiator Springs, a forgotten town on Route 66, and is forced to repair it. While there, he meets Sally, Mater, Doc Hudson, and other characters who help him learn that there's more to life than fame and trophies. McQueen finds friendship and love in the town, and begins to reevaluate his priorities. The movie teaches McQueen the importance of caring for others, integrity, and that winning isn't everything. \n\n" +
"The Incredibles is a 2004 Pixar animated action-adventure film about a family of superheroes who are forced to live a normal suburban life while hiding their powers. The movie is set in a retro-futuristic 1960s and has a runtime of 1 hour and 55 minutes. \n\n" +
"Toy Story is a 1995 animated comedy film about the relationship between Woody, a cowboy doll, and Buzz Lightyear, an action figure. The film takes place in a world where toys come to life when humans are not present. Woody is the leader of the toys in Andy's room, including a Tyrannosaurus Rex and Mr. Potato Head. When Buzz becomes Andy's favorite toy, Woody becomes jealous and plots against him. When Andy's family moves, Woody and Buzz must escape the clutches of their neighbor, Sid Phillips, and reunite with Andy. \n\n" +
"In Toy Story 2, Andy's toys are left to their own devices while he goes to Cowboy Camp, and Woody is kidnapped by a toy collector named Al McWhiggin. Buzz Lightyear and the other toys set out on a rescue mission to save Woody before he becomes a museum toy. \n\n" +
"Iron Man 2 is a 2010 action-adventure fantasy film about Tony Stark (Robert Downey Jr.), a billionaire inventor and superhero who must deal with declining health, government pressure, and a vengeful enemy. \n\n" +
"";

// Chunk the document into 5-token windows that share 2 tokens with their neighbour.
var loader = new TextDataLoader<int, string>(vdb);
loader.AddDocument(document, new TextChunkingOptions<string>
{
Method = TextChunkingMethod.OverlappingWindow,
ChunkSize = 5,
OverlapSize = 2,
RetrieveMetadata = (chunk) => {
// add some basic metadata (the chunk's character length) since this can't be null
return "{ chuckSize: \"" + chunk.Length + "\" }";
}
});

var results = vdb.Search("Lion King", pageCount: 10, threshold: 0.3f);

var texts = results.Texts.ToArray();

// Five chunks are expected to pass the 0.3 threshold.
Assert.AreEqual(5, results.Texts.Count());
// The expected chunk text is lower-cased and holds exactly five tokens.
Assert.AreEqual("the lion king is a", texts[0].Text);
// "the lion king is a" is 18 characters long, which the metadata lambda records.
Assert.AreEqual("{ chuckSize: \"18\" }", texts[0].Metadata);

// The expected text of the later chunks has no apostrophes ("T'Challa" -> "tchalla").
Assert.AreEqual("death of king tchalla the", texts[1].Text);
Assert.AreEqual("youngest of king tritons daughters", texts[2].Text);
}
}
Loading