diff --git a/examples/indexer/index-file-loader.php b/examples/indexer/index-file-loader.php index 82a455431..311d40d42 100644 --- a/examples/indexer/index-file-loader.php +++ b/examples/indexer/index-file-loader.php @@ -10,6 +10,7 @@ */ use Symfony\AI\Platform\Bridge\OpenAi\PlatformFactory; +use Symfony\AI\Store\Document\Loader; use Symfony\AI\Store\Document\Loader\TextFileLoader; use Symfony\AI\Store\Document\Transformer\TextReplaceTransformer; use Symfony\AI\Store\Document\Transformer\TextSplitTransformer; @@ -23,21 +24,22 @@ $store = new InMemoryStore(); $vectorizer = new Vectorizer($platform, 'text-embedding-3-small'); $indexer = new Indexer( - loader: new TextFileLoader(), + loader: new Loader([TextFileLoader::supportedSource() => new TextFileLoader()]), vectorizer: $vectorizer, store: $store, - source: [ - dirname(__DIR__, 2).'/fixtures/movies/gladiator.md', - dirname(__DIR__, 2).'/fixtures/movies/inception.md', - dirname(__DIR__, 2).'/fixtures/movies/jurassic-park.md', - ], transformers: [ new TextReplaceTransformer(search: '## Plot', replace: '## Synopsis'), new TextSplitTransformer(chunkSize: 500, overlap: 100), ], ); -$indexer->index(); +$sources = TextFileLoader::createSource([ + dirname(__DIR__, 2).'/fixtures/movies/gladiator.md', + dirname(__DIR__, 2).'/fixtures/movies/inception.md', + dirname(__DIR__, 2).'/fixtures/movies/jurassic-park.md', +]); + +$indexer->index($sources); $vector = $vectorizer->vectorize('Roman gladiator revenge'); $results = $store->query($vector); diff --git a/examples/indexer/index-inmemory-loader.php b/examples/indexer/index-inmemory-loader.php index 2425bba4b..075c29d4a 100644 --- a/examples/indexer/index-inmemory-loader.php +++ b/examples/indexer/index-inmemory-loader.php @@ -10,8 +10,10 @@ */ use Symfony\AI\Platform\Bridge\OpenAi\PlatformFactory; -use Symfony\AI\Store\Document\Loader\InMemoryLoader; +use Symfony\AI\Store\Document\Loader; +use Symfony\AI\Store\Document\Loader\DocumentCollectionLoader; use 
Symfony\AI\Store\Document\Metadata; +use Symfony\AI\Store\Document\Source\DocumentCollection; use Symfony\AI\Store\Document\TextDocument; use Symfony\AI\Store\Document\Transformer\TextSplitTransformer; use Symfony\AI\Store\Document\Vectorizer; @@ -25,7 +27,7 @@ $store = new InMemoryStore(); $vectorizer = new Vectorizer($platform, 'text-embedding-3-small'); -$documents = [ +$sources = new DocumentCollection([ new TextDocument( Uuid::v4(), 'Artificial Intelligence is transforming the way we work and live. Machine learning algorithms can now process vast amounts of data and make predictions with remarkable accuracy.', @@ -36,19 +38,18 @@ 'Climate change is one of the most pressing challenges of our time. Renewable energy sources like solar and wind power are becoming increasingly important for a sustainable future.', new Metadata(['title' => 'Climate Action']) ), -]; +]); $indexer = new Indexer( - loader: new InMemoryLoader($documents), + loader: new Loader([DocumentCollectionLoader::supportedSource() => new DocumentCollectionLoader()]), vectorizer: $vectorizer, store: $store, - source: null, transformers: [ new TextSplitTransformer(chunkSize: 100, overlap: 20), ], ); -$indexer->index(); +$indexer->index($sources); $vector = $vectorizer->vectorize('machine learning artificial intelligence'); $results = $store->query($vector); diff --git a/examples/indexer/index-rss-loader.php b/examples/indexer/index-rss-loader.php index f21ccfbf9..fb88aa06a 100644 --- a/examples/indexer/index-rss-loader.php +++ b/examples/indexer/index-rss-loader.php @@ -10,6 +10,7 @@ */ use Symfony\AI\Platform\Bridge\OpenAi\PlatformFactory; +use Symfony\AI\Store\Document\Loader; use Symfony\AI\Store\Document\Loader\RssFeedLoader; use Symfony\AI\Store\Document\Transformer\TextSplitTransformer; use Symfony\AI\Store\Document\Vectorizer; @@ -23,19 +24,20 @@ $store = new InMemoryStore(); $vectorizer = new Vectorizer($platform, 'text-embedding-3-small'); $indexer = new Indexer( - loader: new 
RssFeedLoader(HttpClient::create()), + loader: new Loader([RssFeedLoader::supportedSource() => new RssFeedLoader(HttpClient::create())]), vectorizer: $vectorizer, store: $store, - source: [ - 'https://feeds.feedburner.com/symfony/blog', - 'https://www.tagesschau.de/index~rss2.xml', - ], transformers: [ new TextSplitTransformer(chunkSize: 500, overlap: 100), ], ); -$indexer->index(); +$sources = RssFeedLoader::createSource([ + 'https://feeds.feedburner.com/symfony/blog', + 'https://www.tagesschau.de/index~rss2.xml', +]); + +$indexer->index($sources); $vector = $vectorizer->vectorize('Week of Symfony'); $results = $store->query($vector); diff --git a/examples/indexer/index-with-filters.php b/examples/indexer/index-with-filters.php index 4f7894375..0325ad628 100644 --- a/examples/indexer/index-with-filters.php +++ b/examples/indexer/index-with-filters.php @@ -11,8 +11,10 @@ use Symfony\AI\Platform\Bridge\OpenAi\PlatformFactory; use Symfony\AI\Store\Document\Filter\TextContainsFilter; -use Symfony\AI\Store\Document\Loader\InMemoryLoader; +use Symfony\AI\Store\Document\Loader; +use Symfony\AI\Store\Document\Loader\DocumentCollectionLoader; use Symfony\AI\Store\Document\Metadata; +use Symfony\AI\Store\Document\Source\DocumentCollection; use Symfony\AI\Store\Document\TextDocument; use Symfony\AI\Store\Document\Transformer\TextTrimTransformer; use Symfony\AI\Store\Document\Vectorizer; @@ -27,7 +29,7 @@ $vectorizer = new Vectorizer($platform, 'text-embedding-3-small'); // Sample documents with some unwanted content -$documents = [ +$documents = new DocumentCollection([ new TextDocument( Uuid::v4(), 'Artificial Intelligence is transforming the way we work and live. Machine learning algorithms can now process vast amounts of data and make predictions with remarkable accuracy.', @@ -48,7 +50,7 @@ 'Climate change is one of the most pressing challenges of our time. 
Renewable energy sources like solar and wind power are becoming increasingly important for a sustainable future.', new Metadata(['title' => 'Climate Action', 'category' => 'environment']) ), -]; +]); // Create filters to remove unwanted content $filters = [ @@ -57,17 +59,16 @@ ]; $indexer = new Indexer( - loader: new InMemoryLoader($documents), + loader: new Loader([DocumentCollectionLoader::supportedSource() => new DocumentCollectionLoader()]), vectorizer: $vectorizer, store: $store, - source: null, filters: $filters, transformers: [ new TextTrimTransformer(), ], ); -$indexer->index(); +$indexer->index($documents); $vector = $vectorizer->vectorize('technology artificial intelligence'); $results = $store->query($vector); @@ -81,7 +82,7 @@ } echo "=== Results Summary ===\n"; -echo sprintf("Original documents: %d\n", count($documents)); -echo sprintf("Documents after filtering: %d\n", count($results)); -echo sprintf("Filtered out: %d documents\n", count($documents) - count($results)); +echo sprintf("Original documents: %d\n", count($documents->getDocuments())); +echo sprintf("Documents after filtering: %d\n", $i + 1); +echo sprintf("Filtered out: %d documents\n", count($documents->getDocuments()) - ($i + 1)); echo "\nThe 'Week of Symfony' newsletter and SPAM advertisement were successfully filtered out!\n"; diff --git a/examples/memory/mariadb.php b/examples/memory/mariadb.php index d3641098c..9d6023587 100644 --- a/examples/memory/mariadb.php +++ b/examples/memory/mariadb.php @@ -18,7 +18,7 @@ use Symfony\AI\Platform\Message\Message; use Symfony\AI\Platform\Message\MessageBag; use Symfony\AI\Store\Bridge\MariaDb\Store; -use Symfony\AI\Store\Document\Loader\InMemoryLoader; +use Symfony\AI\Store\Document\Loader\DocumentCollectionLoader; use Symfony\AI\Store\Document\Metadata; use Symfony\AI\Store\Document\TextDocument; use Symfony\AI\Store\Document\Vectorizer; @@ -57,7 +57,7 @@ // create embeddings for documents as preparation of the chain memory $platform = 
PlatformFactory::create(env('OPENAI_API_KEY'), http_client()); $vectorizer = new Vectorizer($platform, $embeddings = 'text-embedding-3-small'); -$indexer = new Indexer(new InMemoryLoader($documents), $vectorizer, $store, logger: logger()); +$indexer = new Indexer(new DocumentCollectionLoader($documents), $vectorizer, $store, logger: logger()); $indexer->index($documents); // Execute a chat call that is utilizing the memory diff --git a/examples/ollama/rag.php b/examples/ollama/rag.php index cb9959ad7..21caeb21c 100644 --- a/examples/ollama/rag.php +++ b/examples/ollama/rag.php @@ -17,7 +17,7 @@ use Symfony\AI\Platform\Bridge\Ollama\PlatformFactory; use Symfony\AI\Platform\Message\Message; use Symfony\AI\Platform\Message\MessageBag; -use Symfony\AI\Store\Document\Loader\InMemoryLoader; +use Symfony\AI\Store\Document\Loader\DocumentCollectionLoader; use Symfony\AI\Store\Document\Metadata; use Symfony\AI\Store\Document\TextDocument; use Symfony\AI\Store\Document\Vectorizer; @@ -43,7 +43,7 @@ // create embeddings for documents $platform = PlatformFactory::create(env('OLLAMA_HOST_URL'), http_client()); $vectorizer = new Vectorizer($platform, env('OLLAMA_EMBEDDINGS'), logger()); -$indexer = new Indexer(new InMemoryLoader($documents), $vectorizer, $store, logger: logger()); +$indexer = new Indexer(new DocumentCollectionLoader($documents), $vectorizer, $store, logger: logger()); $indexer->index($documents); $similaritySearch = new SimilaritySearch($vectorizer, $store); diff --git a/examples/rag/cache.php b/examples/rag/cache.php index 3ef15ee06..dbabafa5b 100644 --- a/examples/rag/cache.php +++ b/examples/rag/cache.php @@ -18,7 +18,7 @@ use Symfony\AI\Platform\Message\Message; use Symfony\AI\Platform\Message\MessageBag; use Symfony\AI\Store\Bridge\Cache\Store as CacheStore; -use Symfony\AI\Store\Document\Loader\InMemoryLoader; +use Symfony\AI\Store\Document\Loader\DocumentCollectionLoader; use Symfony\AI\Store\Document\Metadata; use Symfony\AI\Store\Document\TextDocument; 
use Symfony\AI\Store\Document\Vectorizer; @@ -44,7 +44,7 @@ // create embeddings for documents $platform = PlatformFactory::create(env('OPENAI_API_KEY'), http_client()); $vectorizer = new Vectorizer($platform, 'text-embedding-3-small', logger()); -$indexer = new Indexer(new InMemoryLoader($documents), $vectorizer, $store, logger: logger()); +$indexer = new Indexer(new DocumentCollectionLoader($documents), $vectorizer, $store, logger: logger()); $indexer->index($documents); $similaritySearch = new SimilaritySearch($vectorizer, $store); diff --git a/examples/rag/chromadb.php b/examples/rag/chromadb.php index ebdb684d0..f5da93ee7 100644 --- a/examples/rag/chromadb.php +++ b/examples/rag/chromadb.php @@ -19,7 +19,7 @@ use Symfony\AI\Platform\Message\Message; use Symfony\AI\Platform\Message\MessageBag; use Symfony\AI\Store\Bridge\ChromaDb\Store; -use Symfony\AI\Store\Document\Loader\InMemoryLoader; +use Symfony\AI\Store\Document\Loader\DocumentCollectionLoader; use Symfony\AI\Store\Document\Metadata; use Symfony\AI\Store\Document\TextDocument; use Symfony\AI\Store\Document\Vectorizer; @@ -51,7 +51,7 @@ // create embeddings for documents $platform = PlatformFactory::create(env('OPENAI_API_KEY'), http_client()); $vectorizer = new Vectorizer($platform, 'text-embedding-3-small', logger()); -$indexer = new Indexer(new InMemoryLoader($documents), $vectorizer, $store, logger: logger()); +$indexer = new Indexer(new DocumentCollectionLoader($documents), $vectorizer, $store, logger: logger()); $indexer->index($documents); $similaritySearch = new SimilaritySearch($vectorizer, $store); diff --git a/examples/rag/clickhouse.php b/examples/rag/clickhouse.php index 52d9c8f2e..53d83ed99 100644 --- a/examples/rag/clickhouse.php +++ b/examples/rag/clickhouse.php @@ -18,7 +18,7 @@ use Symfony\AI\Platform\Message\Message; use Symfony\AI\Platform\Message\MessageBag; use Symfony\AI\Store\Bridge\ClickHouse\Store; -use Symfony\AI\Store\Document\Loader\InMemoryLoader; +use 
Symfony\AI\Store\Document\Loader\DocumentCollectionLoader; use Symfony\AI\Store\Document\Metadata; use Symfony\AI\Store\Document\TextDocument; use Symfony\AI\Store\Document\Vectorizer; @@ -51,7 +51,7 @@ // create embeddings for documents $platform = PlatformFactory::create(env('OPENAI_API_KEY'), http_client()); $vectorizer = new Vectorizer($platform, 'text-embedding-3-small', logger()); -$indexer = new Indexer(new InMemoryLoader($documents), $vectorizer, $store, logger: logger()); +$indexer = new Indexer(new DocumentCollectionLoader($documents), $vectorizer, $store, logger: logger()); $indexer->index($documents); $similaritySearch = new SimilaritySearch($vectorizer, $store); diff --git a/examples/rag/cloudflare.php b/examples/rag/cloudflare.php index f51e76637..ba9a377b8 100644 --- a/examples/rag/cloudflare.php +++ b/examples/rag/cloudflare.php @@ -18,7 +18,7 @@ use Symfony\AI\Platform\Message\Message; use Symfony\AI\Platform\Message\MessageBag; use Symfony\AI\Store\Bridge\Cloudflare\Store; -use Symfony\AI\Store\Document\Loader\InMemoryLoader; +use Symfony\AI\Store\Document\Loader\DocumentCollectionLoader; use Symfony\AI\Store\Document\Metadata; use Symfony\AI\Store\Document\TextDocument; use Symfony\AI\Store\Document\Vectorizer; @@ -51,7 +51,7 @@ // create embeddings for documents (keep in mind that upserting vectors is asynchronous) $platform = PlatformFactory::create(env('OPENAI_API_KEY'), http_client()); $vectorizer = new Vectorizer($platform, 'text-embedding-3-small', logger()); -$indexer = new Indexer(new InMemoryLoader($documents), $vectorizer, $store, logger: logger()); +$indexer = new Indexer(new DocumentCollectionLoader($documents), $vectorizer, $store, logger: logger()); $indexer->index($documents); $similaritySearch = new SimilaritySearch($vectorizer, $store); diff --git a/examples/rag/elasticsearch.php b/examples/rag/elasticsearch.php index a23c076f9..d340c6fba 100644 --- a/examples/rag/elasticsearch.php +++ b/examples/rag/elasticsearch.php @@ -18,7 
+18,7 @@ use Symfony\AI\Platform\Message\Message; use Symfony\AI\Platform\Message\MessageBag; use Symfony\AI\Store\Bridge\Elasticsearch\Store; -use Symfony\AI\Store\Document\Loader\InMemoryLoader; +use Symfony\AI\Store\Document\Loader\DocumentCollectionLoader; use Symfony\AI\Store\Document\Metadata; use Symfony\AI\Store\Document\TextDocument; use Symfony\AI\Store\Document\Vectorizer; @@ -50,7 +50,7 @@ // create embeddings for documents $platform = PlatformFactory::create(env('OPENAI_API_KEY'), http_client()); $vectorizer = new Vectorizer($platform, 'text-embedding-3-small', logger()); -$indexer = new Indexer(new InMemoryLoader($documents), $vectorizer, $store, logger: logger()); +$indexer = new Indexer(new DocumentCollectionLoader($documents), $vectorizer, $store, logger: logger()); $indexer->index($documents); $similaritySearch = new SimilaritySearch($vectorizer, $store); diff --git a/examples/rag/in-memory.php b/examples/rag/in-memory.php index 8aff814b6..182162b08 100644 --- a/examples/rag/in-memory.php +++ b/examples/rag/in-memory.php @@ -17,7 +17,7 @@ use Symfony\AI\Platform\Bridge\OpenAi\PlatformFactory; use Symfony\AI\Platform\Message\Message; use Symfony\AI\Platform\Message\MessageBag; -use Symfony\AI\Store\Document\Loader\InMemoryLoader; +use Symfony\AI\Store\Document\Loader\DocumentCollectionLoader; use Symfony\AI\Store\Document\Metadata; use Symfony\AI\Store\Document\TextDocument; use Symfony\AI\Store\Document\Vectorizer; @@ -43,7 +43,7 @@ // create embeddings for documents $platform = PlatformFactory::create(env('OPENAI_API_KEY'), http_client()); $vectorizer = new Vectorizer($platform, 'text-embedding-3-small', logger()); -$indexer = new Indexer(new InMemoryLoader($documents), $vectorizer, $store, logger: logger()); +$indexer = new Indexer(new DocumentCollectionLoader($documents), $vectorizer, $store, logger: logger()); $indexer->index($documents); $similaritySearch = new SimilaritySearch($vectorizer, $store); diff --git a/examples/rag/manticore.php 
b/examples/rag/manticore.php index 5f8a569db..df071bdbf 100644 --- a/examples/rag/manticore.php +++ b/examples/rag/manticore.php @@ -18,7 +18,7 @@ use Symfony\AI\Platform\Message\Message; use Symfony\AI\Platform\Message\MessageBag; use Symfony\AI\Store\Bridge\ManticoreSearch\Store; -use Symfony\AI\Store\Document\Loader\InMemoryLoader; +use Symfony\AI\Store\Document\Loader\DocumentCollectionLoader; use Symfony\AI\Store\Document\Metadata; use Symfony\AI\Store\Document\TextDocument; use Symfony\AI\Store\Document\Vectorizer; @@ -51,7 +51,7 @@ // create embeddings for documents $platform = PlatformFactory::create(env('OPENAI_API_KEY'), http_client()); $vectorizer = new Vectorizer($platform, 'text-embedding-3-small', logger()); -$indexer = new Indexer(new InMemoryLoader($documents), $vectorizer, $store, logger: logger()); +$indexer = new Indexer(new DocumentCollectionLoader($documents), $vectorizer, $store, logger: logger()); $indexer->index($documents); $similaritySearch = new SimilaritySearch($vectorizer, $store); diff --git a/examples/rag/mariadb-gemini.php b/examples/rag/mariadb-gemini.php index 2c44feb98..6b7672fac 100644 --- a/examples/rag/mariadb-gemini.php +++ b/examples/rag/mariadb-gemini.php @@ -20,7 +20,7 @@ use Symfony\AI\Platform\Message\Message; use Symfony\AI\Platform\Message\MessageBag; use Symfony\AI\Store\Bridge\MariaDb\Store; -use Symfony\AI\Store\Document\Loader\InMemoryLoader; +use Symfony\AI\Store\Document\Loader\DocumentCollectionLoader; use Symfony\AI\Store\Document\Metadata; use Symfony\AI\Store\Document\TextDocument; use Symfony\AI\Store\Document\Vectorizer; @@ -53,7 +53,7 @@ $platform = PlatformFactory::create(env('GEMINI_API_KEY'), http_client()); $model = 'gemini-embedding-exp-03-07?dimensions=768&task_type=SEMANTIC_SIMILARITY'; $vectorizer = new Vectorizer($platform, $model, logger()); -$indexer = new Indexer(new InMemoryLoader($documents), $vectorizer, $store, logger: logger()); +$indexer = new Indexer(new 
DocumentCollectionLoader($documents), $vectorizer, $store, logger: logger()); $indexer->index($documents); $similaritySearch = new SimilaritySearch($vectorizer, $store); diff --git a/examples/rag/mariadb-openai.php b/examples/rag/mariadb-openai.php index b616958a9..9cebbd727 100644 --- a/examples/rag/mariadb-openai.php +++ b/examples/rag/mariadb-openai.php @@ -20,7 +20,7 @@ use Symfony\AI\Platform\Message\Message; use Symfony\AI\Platform\Message\MessageBag; use Symfony\AI\Store\Bridge\MariaDb\Store; -use Symfony\AI\Store\Document\Loader\InMemoryLoader; +use Symfony\AI\Store\Document\Loader\DocumentCollectionLoader; use Symfony\AI\Store\Document\Metadata; use Symfony\AI\Store\Document\TextDocument; use Symfony\AI\Store\Document\Vectorizer; @@ -52,7 +52,7 @@ // create embeddings for documents $platform = PlatformFactory::create(env('OPENAI_API_KEY'), http_client()); $vectorizer = new Vectorizer($platform, 'text-embedding-3-small', logger()); -$indexer = new Indexer(new InMemoryLoader($documents), $vectorizer, $store, logger: logger()); +$indexer = new Indexer(new DocumentCollectionLoader($documents), $vectorizer, $store, logger: logger()); $indexer->index($documents); $similaritySearch = new SimilaritySearch($vectorizer, $store); diff --git a/examples/rag/meilisearch-hybrid.php b/examples/rag/meilisearch-hybrid.php index cb8346b66..9d9e4aa45 100644 --- a/examples/rag/meilisearch-hybrid.php +++ b/examples/rag/meilisearch-hybrid.php @@ -12,7 +12,7 @@ use Symfony\AI\Fixtures\Movies; use Symfony\AI\Platform\Bridge\OpenAi\PlatformFactory; use Symfony\AI\Store\Bridge\Meilisearch\Store; -use Symfony\AI\Store\Document\Loader\InMemoryLoader; +use Symfony\AI\Store\Document\Loader\DocumentCollectionLoader; use Symfony\AI\Store\Document\Metadata; use Symfony\AI\Store\Document\TextDocument; use Symfony\AI\Store\Document\Vectorizer; @@ -50,7 +50,7 @@ // Create embeddings for documents $platform = PlatformFactory::create(env('OPENAI_API_KEY'), http_client()); $vectorizer = new 
Vectorizer($platform, 'text-embedding-3-small', logger()); -$indexer = new Indexer(new InMemoryLoader($documents), $vectorizer, $store, logger: logger()); +$indexer = new Indexer(new DocumentCollectionLoader($documents), $vectorizer, $store, logger: logger()); $indexer->index($documents); // Create a query embedding diff --git a/examples/rag/meilisearch.php b/examples/rag/meilisearch.php index e5ce5742e..df0495c25 100644 --- a/examples/rag/meilisearch.php +++ b/examples/rag/meilisearch.php @@ -18,7 +18,7 @@ use Symfony\AI\Platform\Message\Message; use Symfony\AI\Platform\Message\MessageBag; use Symfony\AI\Store\Bridge\Meilisearch\Store; -use Symfony\AI\Store\Document\Loader\InMemoryLoader; +use Symfony\AI\Store\Document\Loader\DocumentCollectionLoader; use Symfony\AI\Store\Document\Metadata; use Symfony\AI\Store\Document\TextDocument; use Symfony\AI\Store\Document\Vectorizer; @@ -51,7 +51,7 @@ // create embeddings for documents $platform = PlatformFactory::create(env('OPENAI_API_KEY'), http_client()); $vectorizer = new Vectorizer($platform, 'text-embedding-3-small', logger()); -$indexer = new Indexer(new InMemoryLoader($documents), $vectorizer, $store, logger: logger()); +$indexer = new Indexer(new DocumentCollectionLoader($documents), $vectorizer, $store, logger: logger()); $indexer->index($documents); $similaritySearch = new SimilaritySearch($vectorizer, $store); diff --git a/examples/rag/milvus.php b/examples/rag/milvus.php index 1f8411bf8..0e29e6166 100644 --- a/examples/rag/milvus.php +++ b/examples/rag/milvus.php @@ -18,7 +18,7 @@ use Symfony\AI\Platform\Message\Message; use Symfony\AI\Platform\Message\MessageBag; use Symfony\AI\Store\Bridge\Milvus\Store; -use Symfony\AI\Store\Document\Loader\InMemoryLoader; +use Symfony\AI\Store\Document\Loader\DocumentCollectionLoader; use Symfony\AI\Store\Document\Metadata; use Symfony\AI\Store\Document\TextDocument; use Symfony\AI\Store\Document\Vectorizer; @@ -52,7 +52,7 @@ // create embeddings for documents $platform = 
PlatformFactory::create(env('OPENAI_API_KEY'), http_client()); $vectorizer = new Vectorizer($platform, 'text-embedding-3-small', logger()); -$indexer = new Indexer(new InMemoryLoader($documents), $vectorizer, $store, logger: logger()); +$indexer = new Indexer(new DocumentCollectionLoader($documents), $vectorizer, $store, logger: logger()); $indexer->index($documents); $similaritySearch = new SimilaritySearch($vectorizer, $store); diff --git a/examples/rag/mongodb.php b/examples/rag/mongodb.php index 589f38838..871e641cc 100644 --- a/examples/rag/mongodb.php +++ b/examples/rag/mongodb.php @@ -19,7 +19,7 @@ use Symfony\AI\Platform\Message\Message; use Symfony\AI\Platform\Message\MessageBag; use Symfony\AI\Store\Bridge\MongoDb\Store; -use Symfony\AI\Store\Document\Loader\InMemoryLoader; +use Symfony\AI\Store\Document\Loader\DocumentCollectionLoader; use Symfony\AI\Store\Document\Metadata; use Symfony\AI\Store\Document\TextDocument; use Symfony\AI\Store\Document\Vectorizer; @@ -50,7 +50,7 @@ // create embeddings for documents $platform = PlatformFactory::create(env('OPENAI_API_KEY')); $vectorizer = new Vectorizer($platform, 'text-embedding-3-small', logger()); -$indexer = new Indexer(new InMemoryLoader($documents), $vectorizer, $store, logger: logger()); +$indexer = new Indexer(new DocumentCollectionLoader($documents), $vectorizer, $store, logger: logger()); $indexer->index($documents); // initialize the index diff --git a/examples/rag/neo4j.php b/examples/rag/neo4j.php index 8a109fbfe..ea04c361c 100644 --- a/examples/rag/neo4j.php +++ b/examples/rag/neo4j.php @@ -18,7 +18,7 @@ use Symfony\AI\Platform\Message\Message; use Symfony\AI\Platform\Message\MessageBag; use Symfony\AI\Store\Bridge\Neo4j\Store; -use Symfony\AI\Store\Document\Loader\InMemoryLoader; +use Symfony\AI\Store\Document\Loader\DocumentCollectionLoader; use Symfony\AI\Store\Document\Metadata; use Symfony\AI\Store\Document\TextDocument; use Symfony\AI\Store\Document\Vectorizer; @@ -54,7 +54,7 @@ // create 
embeddings for documents $platform = PlatformFactory::create($_SERVER['OPENAI_API_KEY']); $vectorizer = new Vectorizer($platform, 'text-embedding-3-small', logger()); -$indexer = new Indexer(new InMemoryLoader($documents), $vectorizer, $store, logger: logger()); +$indexer = new Indexer(new DocumentCollectionLoader($documents), $vectorizer, $store, logger: logger()); $indexer->index($documents); $similaritySearch = new SimilaritySearch($vectorizer, $store); diff --git a/examples/rag/opensearch.php b/examples/rag/opensearch.php index a23517fb5..2f792e10e 100644 --- a/examples/rag/opensearch.php +++ b/examples/rag/opensearch.php @@ -18,7 +18,7 @@ use Symfony\AI\Platform\Message\Message; use Symfony\AI\Platform\Message\MessageBag; use Symfony\AI\Store\Bridge\OpenSearch\Store; -use Symfony\AI\Store\Document\Loader\InMemoryLoader; +use Symfony\AI\Store\Document\Loader\DocumentCollectionLoader; use Symfony\AI\Store\Document\Metadata; use Symfony\AI\Store\Document\TextDocument; use Symfony\AI\Store\Document\Vectorizer; @@ -50,7 +50,7 @@ // create embeddings for documents $platform = PlatformFactory::create(env('OPENAI_API_KEY'), http_client()); $vectorizer = new Vectorizer($platform, 'text-embedding-3-small', logger()); -$indexer = new Indexer(new InMemoryLoader($documents), $vectorizer, $store, logger: logger()); +$indexer = new Indexer(new DocumentCollectionLoader($documents), $vectorizer, $store, logger: logger()); $indexer->index($documents); $similaritySearch = new SimilaritySearch($vectorizer, $store); diff --git a/examples/rag/pinecone.php b/examples/rag/pinecone.php index 072dd6b96..76cb20653 100644 --- a/examples/rag/pinecone.php +++ b/examples/rag/pinecone.php @@ -19,7 +19,7 @@ use Symfony\AI\Platform\Message\Message; use Symfony\AI\Platform\Message\MessageBag; use Symfony\AI\Store\Bridge\Pinecone\Store; -use Symfony\AI\Store\Document\Loader\InMemoryLoader; +use Symfony\AI\Store\Document\Loader\DocumentCollectionLoader; use Symfony\AI\Store\Document\Metadata; use 
Symfony\AI\Store\Document\TextDocument; use Symfony\AI\Store\Document\Vectorizer; @@ -44,7 +44,7 @@ // create embeddings for documents $platform = PlatformFactory::create(env('OPENAI_API_KEY'), http_client()); $vectorizer = new Vectorizer($platform, 'text-embedding-3-small', logger()); -$indexer = new Indexer(new InMemoryLoader($documents), $vectorizer, $store, logger: logger()); +$indexer = new Indexer(new DocumentCollectionLoader($documents), $vectorizer, $store, logger: logger()); $indexer->index($documents); $similaritySearch = new SimilaritySearch($vectorizer, $store); diff --git a/examples/rag/postgres.php b/examples/rag/postgres.php index 5a9dc126a..7f7fe8149 100644 --- a/examples/rag/postgres.php +++ b/examples/rag/postgres.php @@ -20,7 +20,7 @@ use Symfony\AI\Platform\Message\Message; use Symfony\AI\Platform\Message\MessageBag; use Symfony\AI\Store\Bridge\Postgres\Store; -use Symfony\AI\Store\Document\Loader\InMemoryLoader; +use Symfony\AI\Store\Document\Loader\DocumentCollectionLoader; use Symfony\AI\Store\Document\Metadata; use Symfony\AI\Store\Document\TextDocument; use Symfony\AI\Store\Document\Vectorizer; @@ -51,7 +51,7 @@ // create embeddings for documents $platform = PlatformFactory::create(env('OPENAI_API_KEY'), http_client()); $vectorizer = new Vectorizer($platform, 'text-embedding-3-small', logger()); -$indexer = new Indexer(new InMemoryLoader($documents), $vectorizer, $store, logger: logger()); +$indexer = new Indexer(new DocumentCollectionLoader($documents), $vectorizer, $store, logger: logger()); $indexer->index($documents); $similaritySearch = new SimilaritySearch($vectorizer, $store); diff --git a/examples/rag/qdrant.php b/examples/rag/qdrant.php index 7a142ca53..83ff2f951 100644 --- a/examples/rag/qdrant.php +++ b/examples/rag/qdrant.php @@ -18,7 +18,7 @@ use Symfony\AI\Platform\Message\Message; use Symfony\AI\Platform\Message\MessageBag; use Symfony\AI\Store\Bridge\Qdrant\Store; -use Symfony\AI\Store\Document\Loader\InMemoryLoader; +use 
Symfony\AI\Store\Document\Loader\DocumentCollectionLoader; use Symfony\AI\Store\Document\Metadata; use Symfony\AI\Store\Document\TextDocument; use Symfony\AI\Store\Document\Vectorizer; @@ -51,7 +51,7 @@ // create embeddings for documents $platform = PlatformFactory::create(env('OPENAI_API_KEY'), http_client()); $vectorizer = new Vectorizer($platform, 'text-embedding-3-small', logger()); -$indexer = new Indexer(new InMemoryLoader($documents), $vectorizer, $store, logger: logger()); +$indexer = new Indexer(new DocumentCollectionLoader($documents), $vectorizer, $store, logger: logger()); $indexer->index($documents); $similaritySearch = new SimilaritySearch($vectorizer, $store); diff --git a/examples/rag/redis.php b/examples/rag/redis.php index eb682a4ff..5aee17869 100644 --- a/examples/rag/redis.php +++ b/examples/rag/redis.php @@ -18,7 +18,7 @@ use Symfony\AI\Platform\Message\Message; use Symfony\AI\Platform\Message\MessageBag; use Symfony\AI\Store\Bridge\Redis\Store; -use Symfony\AI\Store\Document\Loader\InMemoryLoader; +use Symfony\AI\Store\Document\Loader\DocumentCollectionLoader; use Symfony\AI\Store\Document\Metadata; use Symfony\AI\Store\Document\TextDocument; use Symfony\AI\Store\Document\Vectorizer; @@ -53,7 +53,7 @@ // create embeddings for documents $platform = PlatformFactory::create(env('OPENAI_API_KEY'), http_client()); $vectorizer = new Vectorizer($platform, 'text-embedding-3-small', logger()); -$indexer = new Indexer(new InMemoryLoader($documents), $vectorizer, $store, logger: logger()); +$indexer = new Indexer(new DocumentCollectionLoader($documents), $vectorizer, $store, logger: logger()); $indexer->index($documents); $similaritySearch = new SimilaritySearch($vectorizer, $store); diff --git a/examples/rag/supabase.php b/examples/rag/supabase.php index c7cdfca64..9317f936c 100644 --- a/examples/rag/supabase.php +++ b/examples/rag/supabase.php @@ -18,7 +18,7 @@ use Symfony\AI\Platform\Message\Message; use Symfony\AI\Platform\Message\MessageBag; use 
Symfony\AI\Store\Bridge\Supabase\Store; -use Symfony\AI\Store\Document\Loader\InMemoryLoader; +use Symfony\AI\Store\Document\Loader\DocumentCollectionLoader; use Symfony\AI\Store\Document\Metadata; use Symfony\AI\Store\Document\TextDocument; use Symfony\AI\Store\Document\Vectorizer; @@ -50,7 +50,7 @@ functionName: env('SUPABASE_MATCH_FUNCTION'), $platform = PlatformFactory::create(env('OLLAMA_HOST_URL'), http_client()); $vectorizer = new Vectorizer($platform, env('OLLAMA_EMBEDDINGS')); -$loader = new InMemoryLoader($documents); +$loader = new DocumentCollectionLoader($documents); $indexer = new Indexer($loader, $vectorizer, $store, logger: logger()); $indexer->index(); diff --git a/examples/rag/surrealdb.php b/examples/rag/surrealdb.php index 4a99a39c0..85e07539b 100644 --- a/examples/rag/surrealdb.php +++ b/examples/rag/surrealdb.php @@ -18,7 +18,7 @@ use Symfony\AI\Platform\Message\Message; use Symfony\AI\Platform\Message\MessageBag; use Symfony\AI\Store\Bridge\SurrealDb\Store; -use Symfony\AI\Store\Document\Loader\InMemoryLoader; +use Symfony\AI\Store\Document\Loader\DocumentCollectionLoader; use Symfony\AI\Store\Document\Metadata; use Symfony\AI\Store\Document\TextDocument; use Symfony\AI\Store\Document\Vectorizer; @@ -54,7 +54,7 @@ // create embeddings for documents $platform = PlatformFactory::create($_SERVER['OPENAI_API_KEY']); $vectorizer = new Vectorizer($platform, 'text-embedding-3-small', logger()); -$indexer = new Indexer(new InMemoryLoader($documents), $vectorizer, $store, logger: logger()); +$indexer = new Indexer(new DocumentCollectionLoader($documents), $vectorizer, $store, logger: logger()); $indexer->index($documents); $similaritySearch = new SimilaritySearch($vectorizer, $store); diff --git a/examples/rag/typesense.php b/examples/rag/typesense.php index bd2128d02..8b8c723e7 100644 --- a/examples/rag/typesense.php +++ b/examples/rag/typesense.php @@ -18,7 +18,7 @@ use Symfony\AI\Platform\Message\Message; use Symfony\AI\Platform\Message\MessageBag; 
use Symfony\AI\Store\Bridge\Typesense\Store; -use Symfony\AI\Store\Document\Loader\InMemoryLoader; +use Symfony\AI\Store\Document\Loader\DocumentCollectionLoader; use Symfony\AI\Store\Document\Metadata; use Symfony\AI\Store\Document\TextDocument; use Symfony\AI\Store\Document\Vectorizer; @@ -51,7 +51,7 @@ // create embeddings for documents $platform = PlatformFactory::create(env('OPENAI_API_KEY'), http_client()); $vectorizer = new Vectorizer($platform, 'text-embedding-3-small', logger()); -$indexer = new Indexer(new InMemoryLoader($documents), $vectorizer, $store, logger: logger()); +$indexer = new Indexer(new DocumentCollectionLoader($documents), $vectorizer, $store, logger: logger()); $indexer->index($documents); $similaritySearch = new SimilaritySearch($vectorizer, $store); diff --git a/examples/rag/weaviate.php b/examples/rag/weaviate.php index 15e14c46f..00226bfd2 100644 --- a/examples/rag/weaviate.php +++ b/examples/rag/weaviate.php @@ -19,7 +19,7 @@ use Symfony\AI\Platform\Message\Message; use Symfony\AI\Platform\Message\MessageBag; use Symfony\AI\Store\Bridge\Weaviate\Store; -use Symfony\AI\Store\Document\Loader\InMemoryLoader; +use Symfony\AI\Store\Document\Loader\DocumentCollectionLoader; use Symfony\AI\Store\Document\Metadata; use Symfony\AI\Store\Document\TextDocument; use Symfony\AI\Store\Document\Vectorizer; @@ -52,7 +52,7 @@ // create embeddings for documents $platform = PlatformFactory::create(env('OPENAI_API_KEY'), http_client()); $vectorizer = new Vectorizer($platform, 'text-embedding-3-small', logger()); -$indexer = new Indexer(new InMemoryLoader($documents), $vectorizer, $store, logger: logger()); +$indexer = new Indexer(new DocumentCollectionLoader($documents), $vectorizer, $store, logger: logger()); $indexer->index($documents); $similaritySearch = new SimilaritySearch($vectorizer, $store); diff --git a/examples/retriever/movies.php b/examples/retriever/movies.php index 804dc7a15..9d7275fbd 100644 --- a/examples/retriever/movies.php +++ 
b/examples/retriever/movies.php @@ -11,7 +11,7 @@ use Symfony\AI\Fixtures\Movies; use Symfony\AI\Platform\Bridge\OpenAi\PlatformFactory; -use Symfony\AI\Store\Document\Loader\InMemoryLoader; +use Symfony\AI\Store\Document\Loader\DocumentCollectionLoader; use Symfony\AI\Store\Document\Metadata; use Symfony\AI\Store\Document\TextDocument; use Symfony\AI\Store\Document\Vectorizer; @@ -36,7 +36,7 @@ $platform = PlatformFactory::create(env('OPENAI_API_KEY'), http_client()); $vectorizer = new Vectorizer($platform, 'text-embedding-3-small', logger()); -$indexer = new Indexer(new InMemoryLoader($documents), $vectorizer, $store, logger: logger()); +$indexer = new Indexer(new DocumentCollectionLoader($documents), $vectorizer, $store, logger: logger()); $indexer->index(); $retriever = new Retriever($vectorizer, $store, logger()); diff --git a/src/ai-bundle/tests/DependencyInjection/AiBundleTest.php b/src/ai-bundle/tests/DependencyInjection/AiBundleTest.php index ed2157c22..0629c3494 100644 --- a/src/ai-bundle/tests/DependencyInjection/AiBundleTest.php +++ b/src/ai-bundle/tests/DependencyInjection/AiBundleTest.php @@ -66,7 +66,7 @@ use Symfony\AI\Store\Distance\DistanceCalculator; use Symfony\AI\Store\Distance\DistanceStrategy; use Symfony\AI\Store\Document\Filter\TextContainsFilter; -use Symfony\AI\Store\Document\Loader\InMemoryLoader; +use Symfony\AI\Store\Document\Loader\DocumentCollectionLoader; use Symfony\AI\Store\Document\Transformer\TextTrimTransformer; use Symfony\AI\Store\Document\Vectorizer; use Symfony\AI\Store\Document\VectorizerInterface; @@ -5094,7 +5094,7 @@ public function testIndexerWithConfiguredVectorizer() ], 'indexer' => [ 'my_indexer' => [ - 'loader' => InMemoryLoader::class, + 'loader' => DocumentCollectionLoader::class, 'vectorizer' => 'ai.vectorizer.my_vectorizer', 'store' => 'ai.store.memory.my_store', ], @@ -5109,7 +5109,7 @@ public function testIndexerWithConfiguredVectorizer() $arguments = $indexerDefinition->getArguments(); 
$this->assertInstanceOf(Reference::class, $arguments[0]); - $this->assertSame(InMemoryLoader::class, (string) $arguments[0]); + $this->assertSame(DocumentCollectionLoader::class, (string) $arguments[0]); $this->assertInstanceOf(Reference::class, $arguments[1]); $this->assertSame('ai.vectorizer.my_vectorizer', (string) $arguments[1]); @@ -5130,7 +5130,7 @@ public function testIndexerWithStringSource() ], 'indexer' => [ 'my_indexer' => [ - 'loader' => InMemoryLoader::class, + 'loader' => DocumentCollectionLoader::class, 'source' => 'https://example.com/feed.xml', 'vectorizer' => 'my_vectorizer_service', 'store' => 'ai.store.memory.my_store', @@ -5157,7 +5157,7 @@ public function testIndexerWithArraySource() ], 'indexer' => [ 'my_indexer' => [ - 'loader' => InMemoryLoader::class, + 'loader' => DocumentCollectionLoader::class, 'source' => [ '/path/to/file1.txt', '/path/to/file2.txt', @@ -5194,7 +5194,7 @@ public function testIndexerWithNullSource() ], 'indexer' => [ 'my_indexer' => [ - 'loader' => InMemoryLoader::class, + 'loader' => DocumentCollectionLoader::class, 'vectorizer' => 'my_vectorizer_service', 'store' => 'ai.store.memory.my_store', // source not configured, should default to null @@ -5221,7 +5221,7 @@ public function testIndexerWithConfiguredTransformers() ], 'indexer' => [ 'my_indexer' => [ - 'loader' => InMemoryLoader::class, + 'loader' => DocumentCollectionLoader::class, 'transformers' => [ TextTrimTransformer::class, 'App\CustomTransformer', @@ -5259,7 +5259,7 @@ public function testIndexerWithEmptyTransformers() ], 'indexer' => [ 'my_indexer' => [ - 'loader' => InMemoryLoader::class, + 'loader' => DocumentCollectionLoader::class, 'transformers' => [], 'vectorizer' => 'my_vectorizer_service', 'store' => 'ai.store.memory.my_store', @@ -5287,7 +5287,7 @@ public function testIndexerWithoutTransformers() ], 'indexer' => [ 'my_indexer' => [ - 'loader' => InMemoryLoader::class, + 'loader' => DocumentCollectionLoader::class, 'vectorizer' => 
'my_vectorizer_service', 'store' => 'ai.store.memory.my_store', // transformers not configured, should default to empty array @@ -5315,7 +5315,7 @@ public function testIndexerWithSourceAndTransformers() ], 'indexer' => [ 'my_indexer' => [ - 'loader' => InMemoryLoader::class, + 'loader' => DocumentCollectionLoader::class, 'source' => [ '/path/to/file1.txt', '/path/to/file2.txt', @@ -5335,7 +5335,7 @@ public function testIndexerWithSourceAndTransformers() $arguments = $indexerDefinition->getArguments(); $this->assertInstanceOf(Reference::class, $arguments[0]); - $this->assertSame(InMemoryLoader::class, (string) $arguments[0]); + $this->assertSame(DocumentCollectionLoader::class, (string) $arguments[0]); $this->assertInstanceOf(Reference::class, $arguments[1]); $this->assertSame('my_vectorizer_service', (string) $arguments[1]); @@ -5368,7 +5368,7 @@ public function testIndexerWithConfiguredFilters() ], 'indexer' => [ 'my_indexer' => [ - 'loader' => InMemoryLoader::class, + 'loader' => DocumentCollectionLoader::class, 'filters' => [ TextContainsFilter::class, 'App\CustomFilter', @@ -5409,7 +5409,7 @@ public function testIndexerWithEmptyFilters() ], 'indexer' => [ 'my_indexer' => [ - 'loader' => InMemoryLoader::class, + 'loader' => DocumentCollectionLoader::class, 'filters' => [], 'vectorizer' => 'my_vectorizer_service', 'store' => 'ai.store.memory.my_store', @@ -5436,7 +5436,7 @@ public function testIndexerWithoutFilters() ], 'indexer' => [ 'my_indexer' => [ - 'loader' => InMemoryLoader::class, + 'loader' => DocumentCollectionLoader::class, 'vectorizer' => 'my_vectorizer_service', 'store' => 'ai.store.memory.my_store', // filters not configured, should default to empty array @@ -5463,7 +5463,7 @@ public function testIndexerWithFiltersAndTransformers() ], 'indexer' => [ 'my_indexer' => [ - 'loader' => InMemoryLoader::class, + 'loader' => DocumentCollectionLoader::class, 'filters' => [ TextContainsFilter::class, ], @@ -5505,7 +5505,7 @@ public function 
testIndexerWithSourceFiltersAndTransformers() ], 'indexer' => [ 'my_indexer' => [ - 'loader' => InMemoryLoader::class, + 'loader' => DocumentCollectionLoader::class, 'source' => [ '/path/to/file1.txt', '/path/to/file2.txt', @@ -5529,7 +5529,7 @@ public function testIndexerWithSourceFiltersAndTransformers() // Verify correct order: loader, vectorizer, store, source, filters, transformers, logger $this->assertInstanceOf(Reference::class, $arguments[0]); // loader - $this->assertSame(InMemoryLoader::class, (string) $arguments[0]); + $this->assertSame(DocumentCollectionLoader::class, (string) $arguments[0]); $this->assertInstanceOf(Reference::class, $arguments[1]); // vectorizer $this->assertSame('my_vectorizer_service', (string) $arguments[1]); @@ -5566,13 +5566,13 @@ public function testInjectionIndexerAliasIsRegistered() ], 'indexer' => [ 'my_indexer' => [ - 'loader' => InMemoryLoader::class, + 'loader' => DocumentCollectionLoader::class, 'transformers' => [], 'vectorizer' => 'my_vectorizer_service', 'store' => 'ai.store.memory.my_store', ], 'another' => [ - 'loader' => InMemoryLoader::class, + 'loader' => DocumentCollectionLoader::class, 'transformers' => [], 'vectorizer' => 'my_vectorizer_service', 'store' => 'ai.store.memory.my_store', @@ -7564,7 +7564,7 @@ private function getFullConfig(): array ], 'indexer' => [ 'my_text_indexer' => [ - 'loader' => InMemoryLoader::class, + 'loader' => DocumentCollectionLoader::class, 'vectorizer' => 'ai.vectorizer.test_vectorizer', 'store' => 'my_azuresearch_store_service_id', ], diff --git a/src/store/src/Document/Loader.php b/src/store/src/Document/Loader.php new file mode 100644 index 000000000..c4c1a547a --- /dev/null +++ b/src/store/src/Document/Loader.php @@ -0,0 +1,46 @@ + + * + * For the full copyright and license information, please view the LICENSE + * file that was distributed with this source code. 
+ */ + +namespace Symfony\AI\Store\Document; + +use Symfony\AI\Store\Exception\RuntimeException; + +/** + * @author Christopher Hertel + */ +final class Loader +{ + /** + * @param iterable<class-string<SourceInterface>, SourceLoaderInterface> $sourceLoaders loaders keyed by the source class they handle + */ + public function __construct( + private readonly iterable $sourceLoaders, + ) { + } + + /** + * @return iterable documents yielded by the loader registered for each source's class + */ + public function load(iterable $sources): iterable + { + foreach ($sources as $source) { + if (!$source instanceof SourceInterface) { + throw new RuntimeException(\sprintf('Source must implement "%s", "%s" given.', SourceInterface::class, get_debug_type($source))); + } + + if (!isset($this->sourceLoaders[$source::class])) { + throw new RuntimeException(\sprintf('No loader registered for source of type "%s".', $source::class)); + } + + yield from $this->sourceLoaders[$source::class]->load($source); + } + } +} diff --git a/src/store/src/Document/Loader/DocumentCollectionLoader.php b/src/store/src/Document/Loader/DocumentCollectionLoader.php new file mode 100644 index 000000000..4b125570b --- /dev/null +++ b/src/store/src/Document/Loader/DocumentCollectionLoader.php @@ -0,0 +1,44 @@ + + * + * For the full copyright and license information, please view the LICENSE + * file that was distributed with this source code. + */ + +namespace Symfony\AI\Store\Document\Loader; + +use Symfony\AI\Store\Document\Source\DocumentCollection; +use Symfony\AI\Store\Document\SourceInterface; +use Symfony\AI\Store\Document\SourceLoaderInterface; + +/** + * Loader that returns preloaded documents from memory. + * Useful for testing or when documents are already available as objects.
+ * + * @author Oskar Stark + */ +final class DocumentCollectionLoader implements SourceLoaderInterface +{ + public static function createSource(array|string $source): iterable + { + if (!\is_array($source)) { + throw new \InvalidArgumentException('Source must be an array of EmbeddableDocumentInterface instances.'); + } + + yield new DocumentCollection($source); + } + + public static function supportedSource(): string + { + return DocumentCollection::class; + } + + public function load(SourceInterface|DocumentCollection $source, array $options = []): iterable + { + yield from $source->getDocuments(); + } +} diff --git a/src/store/src/Document/Loader/InMemoryLoader.php b/src/store/src/Document/Loader/InMemoryLoader.php deleted file mode 100644 index 81525c426..000000000 --- a/src/store/src/Document/Loader/InMemoryLoader.php +++ /dev/null @@ -1,37 +0,0 @@ - - * - * For the full copyright and license information, please view the LICENSE - * file that was distributed with this source code. - */ - -namespace Symfony\AI\Store\Document\Loader; - -use Symfony\AI\Store\Document\EmbeddableDocumentInterface; -use Symfony\AI\Store\Document\LoaderInterface; - -/** - * Loader that returns preloaded documents from memory. - * Useful for testing or when documents are already available as objects. 
- * - * @author Oskar Stark - */ -final class InMemoryLoader implements LoaderInterface -{ - /** - * @param EmbeddableDocumentInterface[] $documents - */ - public function __construct( - private readonly array $documents = [], - ) { - } - - public function load(?string $source = null, array $options = []): iterable - { - yield from $this->documents; - } -} diff --git a/src/store/src/Document/Loader/RssFeedLoader.php b/src/store/src/Document/Loader/RssFeedLoader.php index 927e47287..5bf188c02 100644 --- a/src/store/src/Document/Loader/RssFeedLoader.php +++ b/src/store/src/Document/Loader/RssFeedLoader.php @@ -12,8 +12,10 @@ namespace Symfony\AI\Store\Document\Loader; use Symfony\AI\Store\Document\Loader\Rss\RssItem; -use Symfony\AI\Store\Document\LoaderInterface; use Symfony\AI\Store\Document\Metadata; +use Symfony\AI\Store\Document\Source\RssFeed; +use Symfony\AI\Store\Document\SourceInterface; +use Symfony\AI\Store\Document\SourceLoaderInterface; use Symfony\AI\Store\Document\TextDocument; use Symfony\AI\Store\Exception\InvalidArgumentException; use Symfony\AI\Store\Exception\RuntimeException; @@ -25,7 +27,7 @@ /** * @author Niklas Grießer */ -final class RssFeedLoader implements LoaderInterface +final class RssFeedLoader implements SourceLoaderInterface { public const OPTION_UUID_NAMESPACE = 'uuid_namespace'; @@ -38,10 +40,33 @@ public function __construct( ) { } + /** + * @return iterable + */ + public static function createSource(string|array $source): iterable + { + foreach ((array) $source as $url) { + if (!\is_string($url)) { + throw new InvalidArgumentException(\sprintf('"%s" requires a string or an array of strings as source, "%s" given.', self::class, get_debug_type($url))); + } + + if (!filter_var($url, \FILTER_VALIDATE_URL)) { + throw new InvalidArgumentException(\sprintf('"%s" is not a valid URL.', $url)); + } + + yield new RssFeed($url); + } + } + + public static function supportedSource(): string + { + return RssFeed::class; + } + /** * @param 
array{uuid_namespace?: string} $options */ - public function load(?string $source = null, array $options = []): iterable + public function load(RssFeed|SourceInterface $source, array $options = []): iterable { if (!class_exists(Crawler::class)) { throw new RuntimeException('For using the RSS loader, the Symfony DomCrawler component is required. Try running "composer require symfony/dom-crawler".'); @@ -54,7 +79,7 @@ public function load(?string $source = null, array $options = []): iterable $uuidNamespace = Uuid::fromString($options[self::OPTION_UUID_NAMESPACE] ?? $this->uuidNamespace); try { - $xml = $this->httpClient->request('GET', $source, [ + $xml = $this->httpClient->request('GET', $source->getUrl(), [ 'headers' => [ 'Accept' => 'application/rss+xml,application/xml,text/xml', ], diff --git a/src/store/src/Document/Loader/TextFileLoader.php b/src/store/src/Document/Loader/TextFileLoader.php index 18cec8cb5..f2fba2cb8 100644 --- a/src/store/src/Document/Loader/TextFileLoader.php +++ b/src/store/src/Document/Loader/TextFileLoader.php @@ -11,8 +11,10 @@ namespace Symfony\AI\Store\Document\Loader; -use Symfony\AI\Store\Document\LoaderInterface; use Symfony\AI\Store\Document\Metadata; +use Symfony\AI\Store\Document\Source\TextFile; +use Symfony\AI\Store\Document\SourceInterface; +use Symfony\AI\Store\Document\SourceLoaderInterface; use Symfony\AI\Store\Document\TextDocument; use Symfony\AI\Store\Exception\InvalidArgumentException; use Symfony\AI\Store\Exception\RuntimeException; @@ -21,19 +23,31 @@ /** * @author Christopher Hertel */ -final class TextFileLoader implements LoaderInterface +final class TextFileLoader implements SourceLoaderInterface { - public function load(?string $source = null, array $options = []): iterable + public static function createSource(string|array $source): iterable { - if (null === $source) { - throw new InvalidArgumentException('TextFileLoader requires a file path as source, null given.'); - } + foreach ((array) $source as $filePath) 
{ + if (!\is_string($filePath)) { + throw new InvalidArgumentException(\sprintf('"%s" requires a string or an array of strings as source, "%s" given.', self::class, get_debug_type($filePath))); + } + + if (!is_file($filePath)) { + throw new InvalidArgumentException(\sprintf('File "%s" does not exist.', $filePath)); + } - if (!is_file($source)) { - throw new RuntimeException(\sprintf('File "%s" does not exist.', $source)); + yield new TextFile($filePath); } + } - $content = file_get_contents($source); + public static function supportedSource(): string + { + return TextFile::class; + } + + public function load(SourceInterface|TextFile $source, array $options = []): iterable + { + $content = file_get_contents($source->getFilePath()); if (false === $content) { - throw new RuntimeException(\sprintf('Unable to read file "%s"', $source)); + throw new RuntimeException(\sprintf('Unable to read file "%s"', $source->getFilePath())); diff --git a/src/store/src/Document/Source/DocumentCollection.php b/src/store/src/Document/Source/DocumentCollection.php new file mode 100644 index 000000000..844c616f1 --- /dev/null +++ b/src/store/src/Document/Source/DocumentCollection.php @@ -0,0 +1,30 @@ + + * + * For the full copyright and license information, please view the LICENSE + * file that was distributed with this source code. + */ + +namespace Symfony\AI\Store\Document\Source; + +use Symfony\AI\Store\Document\SourceInterface; + +/** + * @author Christopher Hertel + */ +final class DocumentCollection implements SourceInterface +{ + public function __construct( + private readonly array $documents = [], + ) { + } + + public function getDocuments(): array + { + return $this->documents; + } +} diff --git a/src/store/src/Document/Source/RssFeed.php b/src/store/src/Document/Source/RssFeed.php new file mode 100644 index 000000000..bb7c3b246 --- /dev/null +++ b/src/store/src/Document/Source/RssFeed.php @@ -0,0 +1,30 @@ + + * + * For the full copyright and license information, please view the LICENSE + * file that was distributed with this source code.
+ */ + +namespace Symfony\AI\Store\Document\Source; + +use Symfony\AI\Store\Document\SourceInterface; + +/** + * @author Christopher Hertel + */ +final class RssFeed implements SourceInterface +{ + public function __construct( + private readonly string $url, + ) { + } + + public function getUrl(): string + { + return $this->url; + } +} diff --git a/src/store/src/Document/Source/TextFile.php b/src/store/src/Document/Source/TextFile.php new file mode 100644 index 000000000..fb809e1aa --- /dev/null +++ b/src/store/src/Document/Source/TextFile.php @@ -0,0 +1,30 @@ + + * + * For the full copyright and license information, please view the LICENSE + * file that was distributed with this source code. + */ + +namespace Symfony\AI\Store\Document\Source; + +use Symfony\AI\Store\Document\SourceInterface; + +/** + * @author Christopher Hertel + */ +final class TextFile implements SourceInterface +{ + public function __construct( + private readonly string $filePath, + ) { + } + + public function getFilePath(): string + { + return $this->filePath; + } +} diff --git a/src/store/src/Document/SourceInterface.php b/src/store/src/Document/SourceInterface.php new file mode 100644 index 000000000..e7978e263 --- /dev/null +++ b/src/store/src/Document/SourceInterface.php @@ -0,0 +1,19 @@ + + * + * For the full copyright and license information, please view the LICENSE + * file that was distributed with this source code. 
+ */ + +namespace Symfony\AI\Store\Document; + +/** + * @author Christopher Hertel + */ +interface SourceInterface +{ +} diff --git a/src/store/src/Document/LoaderInterface.php b/src/store/src/Document/SourceLoaderInterface.php similarity index 53% rename from src/store/src/Document/LoaderInterface.php rename to src/store/src/Document/SourceLoaderInterface.php index 97de1f2ca..7905c47f8 100644 --- a/src/store/src/Document/LoaderInterface.php +++ b/src/store/src/Document/SourceLoaderInterface.php @@ -14,13 +14,25 @@ /** * @author Christopher Hertel */ -interface LoaderInterface +interface SourceLoaderInterface { /** - * @param string|null $source Identifier for the loader to load the documents from, e.g. file path, folder, or URL. Can be null for InMemoryLoader. + * @param string|array $source + * + * @return iterable + */ + public static function createSource(string|array $source): iterable; + + /** + * @return class-string + */ + public static function supportedSource(): string; + + /** + * @param SourceInterface $source Source descriptor instance to load the documents from * @param array $options loader specific set of options to control the loading process * * @return iterable iterable of embeddable documents loaded from the source */ - public function load(?string $source = null, array $options = []): iterable; + public function load(SourceInterface $source, array $options = []): iterable; } diff --git a/src/store/src/Indexer.php b/src/store/src/Indexer.php index d797c05f8..47cb16ece 100644 --- a/src/store/src/Indexer.php +++ b/src/store/src/Indexer.php @@ -13,9 +13,9 @@ use Psr\Log\LoggerInterface; use Psr\Log\NullLogger; -use Symfony\AI\Store\Document\EmbeddableDocumentInterface; use Symfony\AI\Store\Document\FilterInterface; -use Symfony\AI\Store\Document\LoaderInterface; +use Symfony\AI\Store\Document\Loader; +use Symfony\AI\Store\Document\SourceInterface; use Symfony\AI\Store\Document\TransformerInterface; use Symfony\AI\Store\Document\VectorizerInterface; 
@@ -26,57 +26,37 @@ class Indexer implements IndexerInterface { /** - * @var array - */ - private array $sources = []; - - /** - * @param string|array|null $source Source identifier(s) for data loading (file paths, URLs, etc.) - * @param FilterInterface[] $filters Filters to apply after loading documents to remove unwanted content - * @param TransformerInterface[] $transformers Transformers to mutate documents after filtering (chunking, cleaning, etc.) + * @param FilterInterface[] $filters Filters to apply after loading documents to remove unwanted content + * @param TransformerInterface[] $transformers Transformers to mutate documents after filtering (chunking, cleaning, etc.) */ public function __construct( - private LoaderInterface $loader, + private Loader $loader, private VectorizerInterface $vectorizer, private StoreInterface $store, - string|array|null $source = null, private array $filters = [], private array $transformers = [], private LoggerInterface $logger = new NullLogger(), ) { - $this->sources = null === $source ? [] : (array) $source; - } - - public function withSource(string|array $source): self - { - return new self($this->loader, $this->vectorizer, $this->store, $source, $this->filters, $this->transformers, $this->logger); } - public function index(array $options = []): void + public function index(iterable|SourceInterface $sources, array $options = []): void { - $this->logger->debug('Starting document processing', ['sources' => $this->sources, 'options' => $options]); + $this->logger->debug('Starting document processing', ['source' => $sources, 'options' => $options]); - $documents = []; - if ([] === $this->sources) { - $documents = $this->loadSource(null); - } else { - foreach ($this->sources as $singleSource) { - $documents = array_merge($documents, $this->loadSource($singleSource)); - } - } + $documents = $this->loader->load(is_iterable($sources) ? 
$sources : [$sources]); - if ([] === $documents) { + if ([] === $documents = iterator_to_array($documents, false)) { - $this->logger->debug('No documents to process', ['sources' => $this->sources]); + $this->logger->debug('No documents to process', ['sources' => $sources]); return; } foreach ($this->filters as $filter) { - $documents = $filter->filter($documents); + $documents = $filter->filter($documents, $options); } foreach ($this->transformers as $transformer) { - $documents = $transformer->transform($documents); + $documents = $transformer->transform($documents, $options); } $chunkSize = $options['chunk_size'] ?? 50; @@ -87,28 +67,15 @@ public function index(array $options = []): void ++$counter; if ($chunkSize === \count($chunk)) { - $this->store->add(...$this->vectorizer->vectorize($chunk)); + $this->store->add(...$this->vectorizer->vectorize($chunk, $options)); $chunk = []; } } if ([] !== $chunk) { - $this->store->add(...$this->vectorizer->vectorize($chunk)); + $this->store->add(...$this->vectorizer->vectorize($chunk, $options)); } $this->logger->debug('Document processing completed', ['total_documents' => $counter]); } - - /** - * @return EmbeddableDocumentInterface[] - */ - private function loadSource(?string $source): array - { - $documents = []; - foreach ($this->loader->load($source) as $document) { - $documents[] = $document; - } - - return $documents; - } } diff --git a/src/store/src/IndexerInterface.php b/src/store/src/IndexerInterface.php index fcb27494f..b7df2169e 100644 --- a/src/store/src/IndexerInterface.php +++ b/src/store/src/IndexerInterface.php @@ -11,6 +11,8 @@ namespace Symfony\AI\Store; +use Symfony\AI\Store\Document\SourceInterface; + /** * Handles the complete document processing pipeline: load → transform → vectorize → store. * @@ -21,14 +23,8 @@ interface IndexerInterface /** * Process sources through the complete document pipeline: load → transform → vectorize → store.
* - * @param array{chunk_size?: int} $options Processing options - */ - public function index(array $options = []): void; - - /** - * Create a new instance with a different source. - * - * @param string|array $source Source identifier (file path, URL, etc.) or array of sources + * @param SourceInterface|SourceInterface[] $sources Document sources to process + * @param array{chunk_size?: int} $options Processing options */ - public function withSource(string|array $source): self; + public function index(array|SourceInterface $sources, array $options = []): void; } diff --git a/src/store/tests/Document/Loader/InMemoryLoaderTest.php b/src/store/tests/Document/Loader/InMemoryLoaderTest.php index fc5ca0b50..1d42d3813 100644 --- a/src/store/tests/Document/Loader/InMemoryLoaderTest.php +++ b/src/store/tests/Document/Loader/InMemoryLoaderTest.php @@ -12,7 +12,7 @@ namespace Symfony\AI\Store\Tests\Document\Loader; use PHPUnit\Framework\TestCase; -use Symfony\AI\Store\Document\Loader\InMemoryLoader; +use Symfony\AI\Store\Document\Loader\DocumentCollectionLoader; use Symfony\AI\Store\Document\Metadata; use Symfony\AI\Store\Document\TextDocument; use Symfony\Component\Uid\Uuid; @@ -24,7 +24,7 @@ final class InMemoryLoaderTest extends TestCase { public function testLoadWithEmptyDocuments() { - $loader = new InMemoryLoader(); + $loader = new DocumentCollectionLoader(); $documents = iterator_to_array($loader->load(null)); $this->assertSame([], $documents); @@ -33,7 +33,7 @@ public function testLoadWithEmptyDocuments() public function testLoadWithSingleDocument() { $document = new TextDocument(Uuid::v4(), 'This is test content'); - $loader = new InMemoryLoader([$document]); + $loader = new DocumentCollectionLoader([$document]); $documents = iterator_to_array($loader->load(null)); @@ -46,7 +46,7 @@ public function testLoadWithMultipleDocuments() { $document1 = new TextDocument(Uuid::v4(), 'First document'); $document2 = new TextDocument(Uuid::v4(), 'Second document', new 
Metadata(['type' => 'test'])); - $loader = new InMemoryLoader([$document1, $document2]); + $loader = new DocumentCollectionLoader([$document1, $document2]); $documents = iterator_to_array($loader->load(null)); @@ -61,7 +61,7 @@ public function testLoadWithMultipleDocuments() public function testLoadIgnoresSourceParameter() { $document = new TextDocument(Uuid::v4(), 'Test content'); - $loader = new InMemoryLoader([$document]); + $loader = new DocumentCollectionLoader([$document]); // Source parameter should be ignored - same result regardless of value $documentsWithNull = iterator_to_array($loader->load(null)); diff --git a/src/store/tests/IndexerTest.php b/src/store/tests/IndexerTest.php index 6ee808383..16634de26 100644 --- a/src/store/tests/IndexerTest.php +++ b/src/store/tests/IndexerTest.php @@ -16,7 +16,7 @@ use Symfony\AI\Platform\Vector\Vector; use Symfony\AI\Store\Document\Filter\TextContainsFilter; use Symfony\AI\Store\Document\FilterInterface; -use Symfony\AI\Store\Document\Loader\InMemoryLoader; +use Symfony\AI\Store\Document\Loader\DocumentCollectionLoader; use Symfony\AI\Store\Document\Metadata; use Symfony\AI\Store\Document\TextDocument; use Symfony\AI\Store\Document\TransformerInterface; @@ -33,7 +33,7 @@ public function testIndexSingleDocument() { $document = new TextDocument($id = Uuid::v4(), 'Test content'); $vector = new Vector([0.1, 0.2, 0.3]); - $loader = new InMemoryLoader([$document]); + $loader = new DocumentCollectionLoader([$document]); $vectorizer = new Vectorizer(PlatformTestHandler::createPlatform(new VectorResult($vector)), 'text-embedding-3-small'); $indexer = new Indexer($loader, $vectorizer, $store = new TestStore()); @@ -47,7 +47,7 @@ public function testIndexSingleDocument() public function testIndexEmptyDocumentList() { - $loader = new InMemoryLoader([]); + $loader = new DocumentCollectionLoader([]); $vectorizer = new Vectorizer(PlatformTestHandler::createPlatform(), 'text-embedding-3-small'); $indexer = new Indexer($loader, 
$vectorizer, $store = new TestStore()); @@ -61,7 +61,7 @@ public function testIndexDocumentWithMetadata() $metadata = new Metadata(['key' => 'value']); $document = new TextDocument($id = Uuid::v4(), 'Test content', $metadata); $vector = new Vector([0.1, 0.2, 0.3]); - $loader = new InMemoryLoader([$document]); + $loader = new DocumentCollectionLoader([$document]); $vectorizer = new Vectorizer(PlatformTestHandler::createPlatform(new VectorResult($vector)), 'text-embedding-3-small'); $indexer = new Indexer($loader, $vectorizer, $store = new TestStore()); @@ -81,7 +81,7 @@ public function testWithSource() $vector = new Vector([0.1, 0.2, 0.3]); // InMemoryLoader doesn't use source parameter, so we'll test withSource method's immutability - $loader = new InMemoryLoader([$document1]); + $loader = new DocumentCollectionLoader([$document1]); $vectorizer = new Vectorizer(PlatformTestHandler::createPlatform(new VectorResult($vector)), 'text-embedding-3-small'); $indexer = new Indexer($loader, $vectorizer, $store = new TestStore(), 'source1'); @@ -112,7 +112,7 @@ public function testWithSourceArray() $vector6 = new Vector([1.6, 1.7, 1.8]); // InMemoryLoader returns all documents regardless of source - $loader = new InMemoryLoader([$document1, $document2]); + $loader = new DocumentCollectionLoader([$document1, $document2]); // Need 6 vectors total: 2 for first indexer, then 2 for each source in the second indexer (2 sources * 2 docs = 4) $vectorizer = new Vectorizer(PlatformTestHandler::createPlatform(new VectorResult($vector1, $vector2, $vector3, $vector4, $vector5, $vector6)), 'test-embedding-model'); @@ -146,7 +146,7 @@ public function testIndexWithTextContainsFilter() // Filter will remove the "Week of Symfony" document, leaving 2 documents $vector1 = new Vector([0.1, 0.2, 0.3]); $vector2 = new Vector([0.4, 0.5, 0.6]); - $loader = new InMemoryLoader($documents); + $loader = new DocumentCollectionLoader($documents); $vectorizer = new 
Vectorizer(PlatformTestHandler::createPlatform(new VectorResult($vector1, $vector2)), 'test-embedding-model'); $filter = new TextContainsFilter('Week of Symfony'); @@ -168,7 +168,7 @@ public function testIndexWithMultipleFilters() // Filters will remove "Week of Symfony" and "SPAM" documents, leaving 2 documents $vector1 = new Vector([0.1, 0.2, 0.3]); $vector2 = new Vector([0.4, 0.5, 0.6]); - $loader = new InMemoryLoader($documents); + $loader = new DocumentCollectionLoader($documents); $vectorizer = new Vectorizer(PlatformTestHandler::createPlatform(new VectorResult($vector1, $vector2)), 'test-embedding-model'); $filters = [ new TextContainsFilter('Week of Symfony'), @@ -192,7 +192,7 @@ public function testIndexWithFiltersAndTransformers() // Filter will remove "Week of Symfony" document, leaving 2 documents $vector1 = new Vector([0.1, 0.2, 0.3]); $vector2 = new Vector([0.4, 0.5, 0.6]); - $loader = new InMemoryLoader($documents); + $loader = new DocumentCollectionLoader($documents); $vectorizer = new Vectorizer(PlatformTestHandler::createPlatform(new VectorResult($vector1, $vector2)), 'test-embedding-model'); $filter = new TextContainsFilter('Week of Symfony'); $transformer = new class implements TransformerInterface { @@ -228,7 +228,7 @@ public function testIndexWithFiltersAndTransformersAppliesBoth() // Filter will remove the "Remove" document, leaving 2 documents $vector1 = new Vector([0.1, 0.2, 0.3]); $vector2 = new Vector([0.4, 0.5, 0.6]); - $loader = new InMemoryLoader($documents); + $loader = new DocumentCollectionLoader($documents); $vectorizer = new Vectorizer(PlatformTestHandler::createPlatform(new VectorResult($vector1, $vector2)), 'test-embedding-model'); $filter = new class implements FilterInterface { @@ -269,7 +269,7 @@ public function testIndexWithNoFilters() { $document = new TextDocument(Uuid::v4(), 'Test content'); $vector = new Vector([0.1, 0.2, 0.3]); - $loader = new InMemoryLoader([$document]); + $loader = new 
DocumentCollectionLoader([$document]); $vectorizer = new Vectorizer(PlatformTestHandler::createPlatform(new VectorResult($vector)), 'text-embedding-3-small'); $indexer = new Indexer($loader, $vectorizer, $store = new TestStore(), null, []); @@ -282,7 +282,7 @@ public function testWithSourcePreservesFilters() { $document = new TextDocument(Uuid::v4(), 'Test content'); $vector = new Vector([0.1, 0.2, 0.3]); - $loader = new InMemoryLoader([$document]); + $loader = new DocumentCollectionLoader([$document]); $vectorizer = new Vectorizer(PlatformTestHandler::createPlatform(new VectorResult($vector)), 'text-embedding-3-small'); $filter = new TextContainsFilter('nonexistent');