From 0d4958229ee518ef9d6a6461cb2e1627080739c3 Mon Sep 17 00:00:00 2001 From: Daniel Sotirhos Date: Tue, 10 Mar 2026 09:39:36 -0700 Subject: [PATCH 01/59] [A] Bump service lambda version --- lambdas/service/app.py | 2 +- lambdas/service/openapi.json | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/lambdas/service/app.py b/lambdas/service/app.py index e4728bd61a..a56d4af63d 100644 --- a/lambdas/service/app.py +++ b/lambdas/service/app.py @@ -55,7 +55,7 @@ # changes and reset the minor version to zero. Otherwise, increment only # the minor version for backwards compatible changes. A backwards # compatible change is one that does not require updates to clients. - 'version': '16.1', + 'version': '17.0', 'description': fd(f''' # Overview diff --git a/lambdas/service/openapi.json b/lambdas/service/openapi.json index a0cc93b94c..916f2ec32f 100644 --- a/lambdas/service/openapi.json +++ b/lambdas/service/openapi.json @@ -2,7 +2,7 @@ "openapi": "3.0.1", "info": { "title": "azul-service-dev", - "version": "16.1", + "version": "17.0", "description": "\n# Overview\n\nAzul is a REST web service for querying metadata associated with\nboth experimental and analysis data from a data repository. In order\nto deliver response times that make it suitable for interactive use\ncases, the set of metadata properties that it exposes for sorting,\nfiltering, and aggregation is limited. Azul provides a uniform view\nof the metadata over a range of diverse schemas, effectively\nshielding clients from changes in the schemas as they occur over\ntime. It does so, however, at the expense of detail in the set of\nmetadata properties it exposes and in the accuracy with which it\naggregates them.\n\nAzul denormalizes and aggregates metadata into several different\nindices for selected entity types. Metadata entities can be queried\nusing the [Index](#operations-tag-Index) endpoints.\n\nA set of indices forms a catalog. 
There is a default catalog called\n`dcp2` which will be used unless a\ndifferent catalog name is specified using the `catalog` query\nparameter. Metadata from different catalogs is completely\nindependent: a response obtained by querying one catalog does not\nnecessarily correlate to a response obtained by querying another\none. Two catalogs can contain metadata from the same sources or\ndifferent sources. It is only guaranteed that the body of a\nresponse by any given endpoint adheres to one schema,\nindependently of which catalog was specified in the request.\n\nAzul provides the ability to download data and metadata via the\n[Manifests](#operations-tag-Manifests) endpoints. The\n`curl` format manifests can be used to\ndownload data files. Other formats provide various views of the\nmetadata. Manifests can be generated for a selection of files using\nfilters. These filters are interchangeable with the filters used by\nthe [Index](#operations-tag-Index) endpoints.\n\nAzul also provides a [summary](#operations-Index-get_index_summary)\nview of indexed data.\n\n## Data model\n\nAny index, when queried, returns a JSON array of hits. Each hit\nrepresents a metadata entity. Nested in each hit is a summary of the\nproperties of entities associated with the hit. An entity is\nassociated either by a direct edge in the original metadata graph,\nor indirectly as a series of edges. The nested properties are\ngrouped by the type of the associated entity. The properties of all\ndata files associated with a particular sample, for example, are\nlisted under `hits[*].files` in a `/index/samples` response. It is\nimportant to note that while each _hit_ represents a discrete\nentity, the properties nested within that hit are the result of an\naggregation over potentially many associated entities.\n\nTo illustrate this, consider a data file that is part of two\nprojects (a project is a group of related experiments, typically by\none laboratory, institution or consortium). 
Querying the `files`\nindex for this file yields a hit looking something like:\n\n```\n{\n \"projects\": [\n {\n \"projectTitle\": \"Project One\"\n \"laboratory\": ...,\n ...\n },\n {\n \"projectTitle\": \"Project Two\"\n \"laboratory\": ...,\n ...\n }\n ],\n \"files\": [\n {\n \"format\": \"pdf\",\n \"name\": \"Team description.pdf\",\n ...\n }\n ]\n}\n```\n\nThis example hit contains two kinds of nested entities (a hit in an\nactual Azul response will contain more): There are the two projects\nentities, and the file itself. These nested entities contain\nselected metadata properties extracted in a consistent way. This\nmakes filtering and sorting simple.\n\nAlso notice that there is only one file. When querying a particular\nindex, the corresponding entity will always be a singleton like\nthis.\n\n\n## Contact us\n\nFor technical support please file an issue at\n[GitHub](https://github.com/DataBiosphere/azul/issues) or email\n`azul-group@ucsc.edu`. To report a security concern or misconduct please email\n`azul-group@ucsc.edu`.\n" }, "tags": [ From 873e6756e1fd93367a7a5c1abb49a1b419a11c8e Mon Sep 17 00:00:00 2001 From: Daniel Sotirhos Date: Mon, 5 Jan 2026 15:08:45 -0800 Subject: [PATCH 02/59] [A] Remove sort & filter key sampleId from HCA (#6793) --- lambdas/service/openapi.json | 136 ++-------------------- src/azul/plugins/metadata/hca/__init__.py | 3 +- test/service/test_response.py | 4 +- 3 files changed, 10 insertions(+), 133 deletions(-) diff --git a/lambdas/service/openapi.json b/lambdas/service/openapi.json index 916f2ec32f..a48212a36e 100644 --- a/lambdas/service/openapi.json +++ b/lambdas/service/openapi.json @@ -2483,23 +2483,6 @@ ], "additionalProperties": false }, - "sampleId": { - "type": "object", - "properties": { - "is": { - "type": "array", - "items": { - "type": "string", - "nullable": true - }, - "minItems": 1 - } - }, - "required": [ - "is" - ], - "additionalProperties": false - }, "selectedCellType": { "type": "object", "properties": { @@ 
-2778,7 +2761,7 @@ } } }, - "description": "\nCriteria to filter entities from the search results.\n\nEach filter consists of a field name, an operator, and an array of field\nvalues. The available operators are \"is\", \"within\", \"contains\", and\n\"intersects\". Multiple filters are combined using \"and\" logic. For an\nentity to be included in the response, it must match all filters. How\nmultiple field values within a single filter are combined depends on the\noperator.\n\nFor the \"is\" operator, multiple values are combined using \"or\" logic.\nFor example, `{\"fileFormat\": {\"is\": [\"fastq\", \"fastq.gz\"]}}` selects\nentities where the file format is either \"fastq\" or \"fastq.gz\". For the\n\"within\", \"intersects\", and \"contains\" operators, the field values must\ncome in nested pairs specifying upper and lower bounds, and multiple\npairs are combined using \"and\" logic. For example, `{\"donorCount\":\n{\"within\": [[1,5], [5,10]]}}` selects entities whose donor organism\ncount falls within both ranges, i.e., is exactly 5.\n\nThe accessions field supports filtering for a specific accession and/or\nnamespace within a project. For example, `{\"accessions\": {\"is\": [\n{\"namespace\":\"array_express\"}]}}` will filter for projects that have an\n`array_express` accession. Similarly, `{\"accessions\": {\"is\": [\n{\"accession\":\"ERP112843\"}]}}` will filter for projects that have the\naccession `ERP112843` while `{\"accessions\": {\"is\": [\n{\"namespace\":\"array_express\", \"accession\": \"E-AAAA-00\"}]}}` will filter\nfor projects that match both values.\n\nThe organismAge field is special in that it contains two property keys:\nvalue and unit. For example, `{\"organismAge\": {\"is\": [{\"value\": \"20\",\n\"unit\": \"year\"}]}}`. Both keys are required. 
`{\"organismAge\": {\"is\":\n[null]}}` selects entities that have no organism age.\n\nSupported field names are: accessions, aggregateLastModifiedDate, aggregateSubmissionDate, aggregateUpdateDate, assayType, biologicalSex, bionetworkName, bundleUuid, bundleVersion, cellCount, cellLineType, contactName, contentDescription, dataUseRestriction, developmentStage, donorCount, donorDisease, duosId, effectiveCellCount, effectiveOrgan, entryId, fileFormat, fileId, fileName, fileSize, fileSource, fileVersion, genusSpecies, institution, instrumentManufacturerModel, isIntermediate, isTissueAtlasProject, laboratory, lastModifiedDate, libraryConstructionApproach, matrixCellCount, modelOrgan, modelOrganPart, nucleicAcidSource, organ, organPart, organismAge, organismAgeRange, pairedEnd, preservationMethod, project, projectDescription, projectEstimatedCellCount, projectId, projectTitle, publicationTitle, sampleDisease, sampleEntityType, sampleId, selectedCellType, sha256, sourceId, sourceSpec, specimenDisease, specimenOrgan, specimenOrganPart, submissionDate, tissueAtlas, updateDate, workflow, accessible\n" + "description": "\nCriteria to filter entities from the search results.\n\nEach filter consists of a field name, an operator, and an array of field\nvalues. The available operators are \"is\", \"within\", \"contains\", and\n\"intersects\". Multiple filters are combined using \"and\" logic. For an\nentity to be included in the response, it must match all filters. How\nmultiple field values within a single filter are combined depends on the\noperator.\n\nFor the \"is\" operator, multiple values are combined using \"or\" logic.\nFor example, `{\"fileFormat\": {\"is\": [\"fastq\", \"fastq.gz\"]}}` selects\nentities where the file format is either \"fastq\" or \"fastq.gz\". For the\n\"within\", \"intersects\", and \"contains\" operators, the field values must\ncome in nested pairs specifying upper and lower bounds, and multiple\npairs are combined using \"and\" logic. 
For example, `{\"donorCount\":\n{\"within\": [[1,5], [5,10]]}}` selects entities whose donor organism\ncount falls within both ranges, i.e., is exactly 5.\n\nThe accessions field supports filtering for a specific accession and/or\nnamespace within a project. For example, `{\"accessions\": {\"is\": [\n{\"namespace\":\"array_express\"}]}}` will filter for projects that have an\n`array_express` accession. Similarly, `{\"accessions\": {\"is\": [\n{\"accession\":\"ERP112843\"}]}}` will filter for projects that have the\naccession `ERP112843` while `{\"accessions\": {\"is\": [\n{\"namespace\":\"array_express\", \"accession\": \"E-AAAA-00\"}]}}` will filter\nfor projects that match both values.\n\nThe organismAge field is special in that it contains two property keys:\nvalue and unit. For example, `{\"organismAge\": {\"is\": [{\"value\": \"20\",\n\"unit\": \"year\"}]}}`. Both keys are required. `{\"organismAge\": {\"is\":\n[null]}}` selects entities that have no organism age.\n\nSupported field names are: accessions, aggregateLastModifiedDate, aggregateSubmissionDate, aggregateUpdateDate, assayType, biologicalSex, bionetworkName, bundleUuid, bundleVersion, cellCount, cellLineType, contactName, contentDescription, dataUseRestriction, developmentStage, donorCount, donorDisease, duosId, effectiveCellCount, effectiveOrgan, entryId, fileFormat, fileId, fileName, fileSize, fileSource, fileVersion, genusSpecies, institution, instrumentManufacturerModel, isIntermediate, isTissueAtlasProject, laboratory, lastModifiedDate, libraryConstructionApproach, matrixCellCount, modelOrgan, modelOrganPart, nucleicAcidSource, organ, organPart, organismAge, organismAgeRange, pairedEnd, preservationMethod, project, projectDescription, projectEstimatedCellCount, projectId, projectTitle, publicationTitle, sampleDisease, sampleEntityType, selectedCellType, sha256, sourceId, sourceSpec, specimenDisease, specimenOrgan, specimenOrganPart, submissionDate, tissueAtlas, updateDate, workflow, 
accessible\n" }, { "name": "entity_type", @@ -2867,7 +2850,6 @@ "publicationTitle", "sampleDisease", "sampleEntityType", - "sampleId", "selectedCellType", "sha256", "sourceId", @@ -4367,23 +4349,6 @@ ], "additionalProperties": false }, - "sampleId": { - "type": "object", - "properties": { - "is": { - "type": "array", - "items": { - "type": "string", - "nullable": true - }, - "minItems": 1 - } - }, - "required": [ - "is" - ], - "additionalProperties": false - }, "selectedCellType": { "type": "object", "properties": { @@ -4662,7 +4627,7 @@ } } }, - "description": "\nCriteria to filter entities from the search results.\n\nEach filter consists of a field name, an operator, and an array of field\nvalues. The available operators are \"is\", \"within\", \"contains\", and\n\"intersects\". Multiple filters are combined using \"and\" logic. For an\nentity to be included in the response, it must match all filters. How\nmultiple field values within a single filter are combined depends on the\noperator.\n\nFor the \"is\" operator, multiple values are combined using \"or\" logic.\nFor example, `{\"fileFormat\": {\"is\": [\"fastq\", \"fastq.gz\"]}}` selects\nentities where the file format is either \"fastq\" or \"fastq.gz\". For the\n\"within\", \"intersects\", and \"contains\" operators, the field values must\ncome in nested pairs specifying upper and lower bounds, and multiple\npairs are combined using \"and\" logic. For example, `{\"donorCount\":\n{\"within\": [[1,5], [5,10]]}}` selects entities whose donor organism\ncount falls within both ranges, i.e., is exactly 5.\n\nThe accessions field supports filtering for a specific accession and/or\nnamespace within a project. For example, `{\"accessions\": {\"is\": [\n{\"namespace\":\"array_express\"}]}}` will filter for projects that have an\n`array_express` accession. 
Similarly, `{\"accessions\": {\"is\": [\n{\"accession\":\"ERP112843\"}]}}` will filter for projects that have the\naccession `ERP112843` while `{\"accessions\": {\"is\": [\n{\"namespace\":\"array_express\", \"accession\": \"E-AAAA-00\"}]}}` will filter\nfor projects that match both values.\n\nThe organismAge field is special in that it contains two property keys:\nvalue and unit. For example, `{\"organismAge\": {\"is\": [{\"value\": \"20\",\n\"unit\": \"year\"}]}}`. Both keys are required. `{\"organismAge\": {\"is\":\n[null]}}` selects entities that have no organism age.\n\nSupported field names are: accessions, aggregateLastModifiedDate, aggregateSubmissionDate, aggregateUpdateDate, assayType, biologicalSex, bionetworkName, bundleUuid, bundleVersion, cellCount, cellLineType, contactName, contentDescription, dataUseRestriction, developmentStage, donorCount, donorDisease, duosId, effectiveCellCount, effectiveOrgan, entryId, fileFormat, fileId, fileName, fileSize, fileSource, fileVersion, genusSpecies, institution, instrumentManufacturerModel, isIntermediate, isTissueAtlasProject, laboratory, lastModifiedDate, libraryConstructionApproach, matrixCellCount, modelOrgan, modelOrganPart, nucleicAcidSource, organ, organPart, organismAge, organismAgeRange, pairedEnd, preservationMethod, project, projectDescription, projectEstimatedCellCount, projectId, projectTitle, publicationTitle, sampleDisease, sampleEntityType, sampleId, selectedCellType, sha256, sourceId, sourceSpec, specimenDisease, specimenOrgan, specimenOrganPart, submissionDate, tissueAtlas, updateDate, workflow, accessible\n" + "description": "\nCriteria to filter entities from the search results.\n\nEach filter consists of a field name, an operator, and an array of field\nvalues. The available operators are \"is\", \"within\", \"contains\", and\n\"intersects\". Multiple filters are combined using \"and\" logic. For an\nentity to be included in the response, it must match all filters. 
How\nmultiple field values within a single filter are combined depends on the\noperator.\n\nFor the \"is\" operator, multiple values are combined using \"or\" logic.\nFor example, `{\"fileFormat\": {\"is\": [\"fastq\", \"fastq.gz\"]}}` selects\nentities where the file format is either \"fastq\" or \"fastq.gz\". For the\n\"within\", \"intersects\", and \"contains\" operators, the field values must\ncome in nested pairs specifying upper and lower bounds, and multiple\npairs are combined using \"and\" logic. For example, `{\"donorCount\":\n{\"within\": [[1,5], [5,10]]}}` selects entities whose donor organism\ncount falls within both ranges, i.e., is exactly 5.\n\nThe accessions field supports filtering for a specific accession and/or\nnamespace within a project. For example, `{\"accessions\": {\"is\": [\n{\"namespace\":\"array_express\"}]}}` will filter for projects that have an\n`array_express` accession. Similarly, `{\"accessions\": {\"is\": [\n{\"accession\":\"ERP112843\"}]}}` will filter for projects that have the\naccession `ERP112843` while `{\"accessions\": {\"is\": [\n{\"namespace\":\"array_express\", \"accession\": \"E-AAAA-00\"}]}}` will filter\nfor projects that match both values.\n\nThe organismAge field is special in that it contains two property keys:\nvalue and unit. For example, `{\"organismAge\": {\"is\": [{\"value\": \"20\",\n\"unit\": \"year\"}]}}`. Both keys are required. 
`{\"organismAge\": {\"is\":\n[null]}}` selects entities that have no organism age.\n\nSupported field names are: accessions, aggregateLastModifiedDate, aggregateSubmissionDate, aggregateUpdateDate, assayType, biologicalSex, bionetworkName, bundleUuid, bundleVersion, cellCount, cellLineType, contactName, contentDescription, dataUseRestriction, developmentStage, donorCount, donorDisease, duosId, effectiveCellCount, effectiveOrgan, entryId, fileFormat, fileId, fileName, fileSize, fileSource, fileVersion, genusSpecies, institution, instrumentManufacturerModel, isIntermediate, isTissueAtlasProject, laboratory, lastModifiedDate, libraryConstructionApproach, matrixCellCount, modelOrgan, modelOrganPart, nucleicAcidSource, organ, organPart, organismAge, organismAgeRange, pairedEnd, preservationMethod, project, projectDescription, projectEstimatedCellCount, projectId, projectTitle, publicationTitle, sampleDisease, sampleEntityType, selectedCellType, sha256, sourceId, sourceSpec, specimenDisease, specimenOrgan, specimenOrganPart, submissionDate, tissueAtlas, updateDate, workflow, accessible\n" }, { "name": "entity_type", @@ -4751,7 +4716,6 @@ "publicationTitle", "sampleDisease", "sampleEntityType", - "sampleId", "selectedCellType", "sha256", "sourceId", @@ -6251,23 +6215,6 @@ ], "additionalProperties": false }, - "sampleId": { - "type": "object", - "properties": { - "is": { - "type": "array", - "items": { - "type": "string", - "nullable": true - }, - "minItems": 1 - } - }, - "required": [ - "is" - ], - "additionalProperties": false - }, "selectedCellType": { "type": "object", "properties": { @@ -6546,7 +6493,7 @@ } } }, - "description": "\nCriteria to filter entities from the search results.\n\nEach filter consists of a field name, an operator, and an array of field\nvalues. The available operators are \"is\", \"within\", \"contains\", and\n\"intersects\". Multiple filters are combined using \"and\" logic. 
For an\nentity to be included in the response, it must match all filters. How\nmultiple field values within a single filter are combined depends on the\noperator.\n\nFor the \"is\" operator, multiple values are combined using \"or\" logic.\nFor example, `{\"fileFormat\": {\"is\": [\"fastq\", \"fastq.gz\"]}}` selects\nentities where the file format is either \"fastq\" or \"fastq.gz\". For the\n\"within\", \"intersects\", and \"contains\" operators, the field values must\ncome in nested pairs specifying upper and lower bounds, and multiple\npairs are combined using \"and\" logic. For example, `{\"donorCount\":\n{\"within\": [[1,5], [5,10]]}}` selects entities whose donor organism\ncount falls within both ranges, i.e., is exactly 5.\n\nThe accessions field supports filtering for a specific accession and/or\nnamespace within a project. For example, `{\"accessions\": {\"is\": [\n{\"namespace\":\"array_express\"}]}}` will filter for projects that have an\n`array_express` accession. Similarly, `{\"accessions\": {\"is\": [\n{\"accession\":\"ERP112843\"}]}}` will filter for projects that have the\naccession `ERP112843` while `{\"accessions\": {\"is\": [\n{\"namespace\":\"array_express\", \"accession\": \"E-AAAA-00\"}]}}` will filter\nfor projects that match both values.\n\nThe organismAge field is special in that it contains two property keys:\nvalue and unit. For example, `{\"organismAge\": {\"is\": [{\"value\": \"20\",\n\"unit\": \"year\"}]}}`. Both keys are required. 
`{\"organismAge\": {\"is\":\n[null]}}` selects entities that have no organism age.\n\nSupported field names are: accessions, aggregateLastModifiedDate, aggregateSubmissionDate, aggregateUpdateDate, assayType, biologicalSex, bionetworkName, bundleUuid, bundleVersion, cellCount, cellLineType, contactName, contentDescription, dataUseRestriction, developmentStage, donorCount, donorDisease, duosId, effectiveCellCount, effectiveOrgan, entryId, fileFormat, fileId, fileName, fileSize, fileSource, fileVersion, genusSpecies, institution, instrumentManufacturerModel, isIntermediate, isTissueAtlasProject, laboratory, lastModifiedDate, libraryConstructionApproach, matrixCellCount, modelOrgan, modelOrganPart, nucleicAcidSource, organ, organPart, organismAge, organismAgeRange, pairedEnd, preservationMethod, project, projectDescription, projectEstimatedCellCount, projectId, projectTitle, publicationTitle, sampleDisease, sampleEntityType, sampleId, selectedCellType, sha256, sourceId, sourceSpec, specimenDisease, specimenOrgan, specimenOrganPart, submissionDate, tissueAtlas, updateDate, workflow, accessible\n" + "description": "\nCriteria to filter entities from the search results.\n\nEach filter consists of a field name, an operator, and an array of field\nvalues. The available operators are \"is\", \"within\", \"contains\", and\n\"intersects\". Multiple filters are combined using \"and\" logic. For an\nentity to be included in the response, it must match all filters. How\nmultiple field values within a single filter are combined depends on the\noperator.\n\nFor the \"is\" operator, multiple values are combined using \"or\" logic.\nFor example, `{\"fileFormat\": {\"is\": [\"fastq\", \"fastq.gz\"]}}` selects\nentities where the file format is either \"fastq\" or \"fastq.gz\". For the\n\"within\", \"intersects\", and \"contains\" operators, the field values must\ncome in nested pairs specifying upper and lower bounds, and multiple\npairs are combined using \"and\" logic. 
For example, `{\"donorCount\":\n{\"within\": [[1,5], [5,10]]}}` selects entities whose donor organism\ncount falls within both ranges, i.e., is exactly 5.\n\nThe accessions field supports filtering for a specific accession and/or\nnamespace within a project. For example, `{\"accessions\": {\"is\": [\n{\"namespace\":\"array_express\"}]}}` will filter for projects that have an\n`array_express` accession. Similarly, `{\"accessions\": {\"is\": [\n{\"accession\":\"ERP112843\"}]}}` will filter for projects that have the\naccession `ERP112843` while `{\"accessions\": {\"is\": [\n{\"namespace\":\"array_express\", \"accession\": \"E-AAAA-00\"}]}}` will filter\nfor projects that match both values.\n\nThe organismAge field is special in that it contains two property keys:\nvalue and unit. For example, `{\"organismAge\": {\"is\": [{\"value\": \"20\",\n\"unit\": \"year\"}]}}`. Both keys are required. `{\"organismAge\": {\"is\":\n[null]}}` selects entities that have no organism age.\n\nSupported field names are: accessions, aggregateLastModifiedDate, aggregateSubmissionDate, aggregateUpdateDate, assayType, biologicalSex, bionetworkName, bundleUuid, bundleVersion, cellCount, cellLineType, contactName, contentDescription, dataUseRestriction, developmentStage, donorCount, donorDisease, duosId, effectiveCellCount, effectiveOrgan, entryId, fileFormat, fileId, fileName, fileSize, fileSource, fileVersion, genusSpecies, institution, instrumentManufacturerModel, isIntermediate, isTissueAtlasProject, laboratory, lastModifiedDate, libraryConstructionApproach, matrixCellCount, modelOrgan, modelOrganPart, nucleicAcidSource, organ, organPart, organismAge, organismAgeRange, pairedEnd, preservationMethod, project, projectDescription, projectEstimatedCellCount, projectId, projectTitle, publicationTitle, sampleDisease, sampleEntityType, selectedCellType, sha256, sourceId, sourceSpec, specimenDisease, specimenOrgan, specimenOrganPart, submissionDate, tissueAtlas, updateDate, workflow, 
accessible\n" }, { "name": "entity_type", @@ -6635,7 +6582,6 @@ "publicationTitle", "sampleDisease", "sampleEntityType", - "sampleId", "selectedCellType", "sha256", "sourceId", @@ -8025,23 +7971,6 @@ ], "additionalProperties": false }, - "sampleId": { - "type": "object", - "properties": { - "is": { - "type": "array", - "items": { - "type": "string", - "nullable": true - }, - "minItems": 1 - } - }, - "required": [ - "is" - ], - "additionalProperties": false - }, "selectedCellType": { "type": "object", "properties": { @@ -8320,7 +8249,7 @@ } } }, - "description": "\nCriteria to filter entities from the search results.\n\nEach filter consists of a field name, an operator, and an array of field\nvalues. The available operators are \"is\", \"within\", \"contains\", and\n\"intersects\". Multiple filters are combined using \"and\" logic. For an\nentity to be included in the response, it must match all filters. How\nmultiple field values within a single filter are combined depends on the\noperator.\n\nFor the \"is\" operator, multiple values are combined using \"or\" logic.\nFor example, `{\"fileFormat\": {\"is\": [\"fastq\", \"fastq.gz\"]}}` selects\nentities where the file format is either \"fastq\" or \"fastq.gz\". For the\n\"within\", \"intersects\", and \"contains\" operators, the field values must\ncome in nested pairs specifying upper and lower bounds, and multiple\npairs are combined using \"and\" logic. For example, `{\"donorCount\":\n{\"within\": [[1,5], [5,10]]}}` selects entities whose donor organism\ncount falls within both ranges, i.e., is exactly 5.\n\nThe accessions field supports filtering for a specific accession and/or\nnamespace within a project. For example, `{\"accessions\": {\"is\": [\n{\"namespace\":\"array_express\"}]}}` will filter for projects that have an\n`array_express` accession. 
Similarly, `{\"accessions\": {\"is\": [\n{\"accession\":\"ERP112843\"}]}}` will filter for projects that have the\naccession `ERP112843` while `{\"accessions\": {\"is\": [\n{\"namespace\":\"array_express\", \"accession\": \"E-AAAA-00\"}]}}` will filter\nfor projects that match both values.\n\nThe organismAge field is special in that it contains two property keys:\nvalue and unit. For example, `{\"organismAge\": {\"is\": [{\"value\": \"20\",\n\"unit\": \"year\"}]}}`. Both keys are required. `{\"organismAge\": {\"is\":\n[null]}}` selects entities that have no organism age.\n\nSupported field names are: accessions, aggregateLastModifiedDate, aggregateSubmissionDate, aggregateUpdateDate, assayType, biologicalSex, bionetworkName, bundleUuid, bundleVersion, cellCount, cellLineType, contactName, contentDescription, dataUseRestriction, developmentStage, donorCount, donorDisease, duosId, effectiveCellCount, effectiveOrgan, entryId, fileFormat, fileId, fileName, fileSize, fileSource, fileVersion, genusSpecies, institution, instrumentManufacturerModel, isIntermediate, isTissueAtlasProject, laboratory, lastModifiedDate, libraryConstructionApproach, matrixCellCount, modelOrgan, modelOrganPart, nucleicAcidSource, organ, organPart, organismAge, organismAgeRange, pairedEnd, preservationMethod, project, projectDescription, projectEstimatedCellCount, projectId, projectTitle, publicationTitle, sampleDisease, sampleEntityType, sampleId, selectedCellType, sha256, sourceId, sourceSpec, specimenDisease, specimenOrgan, specimenOrganPart, submissionDate, tissueAtlas, updateDate, workflow, accessible\n" + "description": "\nCriteria to filter entities from the search results.\n\nEach filter consists of a field name, an operator, and an array of field\nvalues. The available operators are \"is\", \"within\", \"contains\", and\n\"intersects\". Multiple filters are combined using \"and\" logic. For an\nentity to be included in the response, it must match all filters. 
How\nmultiple field values within a single filter are combined depends on the\noperator.\n\nFor the \"is\" operator, multiple values are combined using \"or\" logic.\nFor example, `{\"fileFormat\": {\"is\": [\"fastq\", \"fastq.gz\"]}}` selects\nentities where the file format is either \"fastq\" or \"fastq.gz\". For the\n\"within\", \"intersects\", and \"contains\" operators, the field values must\ncome in nested pairs specifying upper and lower bounds, and multiple\npairs are combined using \"and\" logic. For example, `{\"donorCount\":\n{\"within\": [[1,5], [5,10]]}}` selects entities whose donor organism\ncount falls within both ranges, i.e., is exactly 5.\n\nThe accessions field supports filtering for a specific accession and/or\nnamespace within a project. For example, `{\"accessions\": {\"is\": [\n{\"namespace\":\"array_express\"}]}}` will filter for projects that have an\n`array_express` accession. Similarly, `{\"accessions\": {\"is\": [\n{\"accession\":\"ERP112843\"}]}}` will filter for projects that have the\naccession `ERP112843` while `{\"accessions\": {\"is\": [\n{\"namespace\":\"array_express\", \"accession\": \"E-AAAA-00\"}]}}` will filter\nfor projects that match both values.\n\nThe organismAge field is special in that it contains two property keys:\nvalue and unit. For example, `{\"organismAge\": {\"is\": [{\"value\": \"20\",\n\"unit\": \"year\"}]}}`. Both keys are required. 
`{\"organismAge\": {\"is\":\n[null]}}` selects entities that have no organism age.\n\nSupported field names are: accessions, aggregateLastModifiedDate, aggregateSubmissionDate, aggregateUpdateDate, assayType, biologicalSex, bionetworkName, bundleUuid, bundleVersion, cellCount, cellLineType, contactName, contentDescription, dataUseRestriction, developmentStage, donorCount, donorDisease, duosId, effectiveCellCount, effectiveOrgan, entryId, fileFormat, fileId, fileName, fileSize, fileSource, fileVersion, genusSpecies, institution, instrumentManufacturerModel, isIntermediate, isTissueAtlasProject, laboratory, lastModifiedDate, libraryConstructionApproach, matrixCellCount, modelOrgan, modelOrganPart, nucleicAcidSource, organ, organPart, organismAge, organismAgeRange, pairedEnd, preservationMethod, project, projectDescription, projectEstimatedCellCount, projectId, projectTitle, publicationTitle, sampleDisease, sampleEntityType, selectedCellType, sha256, sourceId, sourceSpec, specimenDisease, specimenOrgan, specimenOrganPart, submissionDate, tissueAtlas, updateDate, workflow, accessible\n" } ] }, @@ -9713,23 +9642,6 @@ ], "additionalProperties": false }, - "sampleId": { - "type": "object", - "properties": { - "is": { - "type": "array", - "items": { - "type": "string", - "nullable": true - }, - "minItems": 1 - } - }, - "required": [ - "is" - ], - "additionalProperties": false - }, "selectedCellType": { "type": "object", "properties": { @@ -10008,7 +9920,7 @@ } } }, - "description": "\nCriteria to filter entities from the search results.\n\nEach filter consists of a field name, an operator, and an array of field\nvalues. The available operators are \"is\", \"within\", \"contains\", and\n\"intersects\". Multiple filters are combined using \"and\" logic. For an\nentity to be included in the response, it must match all filters. 
How\nmultiple field values within a single filter are combined depends on the\noperator.\n\nFor the \"is\" operator, multiple values are combined using \"or\" logic.\nFor example, `{\"fileFormat\": {\"is\": [\"fastq\", \"fastq.gz\"]}}` selects\nentities where the file format is either \"fastq\" or \"fastq.gz\". For the\n\"within\", \"intersects\", and \"contains\" operators, the field values must\ncome in nested pairs specifying upper and lower bounds, and multiple\npairs are combined using \"and\" logic. For example, `{\"donorCount\":\n{\"within\": [[1,5], [5,10]]}}` selects entities whose donor organism\ncount falls within both ranges, i.e., is exactly 5.\n\nThe accessions field supports filtering for a specific accession and/or\nnamespace within a project. For example, `{\"accessions\": {\"is\": [\n{\"namespace\":\"array_express\"}]}}` will filter for projects that have an\n`array_express` accession. Similarly, `{\"accessions\": {\"is\": [\n{\"accession\":\"ERP112843\"}]}}` will filter for projects that have the\naccession `ERP112843` while `{\"accessions\": {\"is\": [\n{\"namespace\":\"array_express\", \"accession\": \"E-AAAA-00\"}]}}` will filter\nfor projects that match both values.\n\nThe organismAge field is special in that it contains two property keys:\nvalue and unit. For example, `{\"organismAge\": {\"is\": [{\"value\": \"20\",\n\"unit\": \"year\"}]}}`. Both keys are required. 
`{\"organismAge\": {\"is\":\n[null]}}` selects entities that have no organism age.\n\nSupported field names are: accessions, aggregateLastModifiedDate, aggregateSubmissionDate, aggregateUpdateDate, assayType, biologicalSex, bionetworkName, bundleUuid, bundleVersion, cellCount, cellLineType, contactName, contentDescription, dataUseRestriction, developmentStage, donorCount, donorDisease, duosId, effectiveCellCount, effectiveOrgan, entryId, fileFormat, fileId, fileName, fileSize, fileSource, fileVersion, genusSpecies, institution, instrumentManufacturerModel, isIntermediate, isTissueAtlasProject, laboratory, lastModifiedDate, libraryConstructionApproach, matrixCellCount, modelOrgan, modelOrganPart, nucleicAcidSource, organ, organPart, organismAge, organismAgeRange, pairedEnd, preservationMethod, project, projectDescription, projectEstimatedCellCount, projectId, projectTitle, publicationTitle, sampleDisease, sampleEntityType, sampleId, selectedCellType, sha256, sourceId, sourceSpec, specimenDisease, specimenOrgan, specimenOrganPart, submissionDate, tissueAtlas, updateDate, workflow, accessible\n" + "description": "\nCriteria to filter entities from the search results.\n\nEach filter consists of a field name, an operator, and an array of field\nvalues. The available operators are \"is\", \"within\", \"contains\", and\n\"intersects\". Multiple filters are combined using \"and\" logic. For an\nentity to be included in the response, it must match all filters. How\nmultiple field values within a single filter are combined depends on the\noperator.\n\nFor the \"is\" operator, multiple values are combined using \"or\" logic.\nFor example, `{\"fileFormat\": {\"is\": [\"fastq\", \"fastq.gz\"]}}` selects\nentities where the file format is either \"fastq\" or \"fastq.gz\". For the\n\"within\", \"intersects\", and \"contains\" operators, the field values must\ncome in nested pairs specifying upper and lower bounds, and multiple\npairs are combined using \"and\" logic. 
For example, `{\"donorCount\":\n{\"within\": [[1,5], [5,10]]}}` selects entities whose donor organism\ncount falls within both ranges, i.e., is exactly 5.\n\nThe accessions field supports filtering for a specific accession and/or\nnamespace within a project. For example, `{\"accessions\": {\"is\": [\n{\"namespace\":\"array_express\"}]}}` will filter for projects that have an\n`array_express` accession. Similarly, `{\"accessions\": {\"is\": [\n{\"accession\":\"ERP112843\"}]}}` will filter for projects that have the\naccession `ERP112843` while `{\"accessions\": {\"is\": [\n{\"namespace\":\"array_express\", \"accession\": \"E-AAAA-00\"}]}}` will filter\nfor projects that match both values.\n\nThe organismAge field is special in that it contains two property keys:\nvalue and unit. For example, `{\"organismAge\": {\"is\": [{\"value\": \"20\",\n\"unit\": \"year\"}]}}`. Both keys are required. `{\"organismAge\": {\"is\":\n[null]}}` selects entities that have no organism age.\n\nSupported field names are: accessions, aggregateLastModifiedDate, aggregateSubmissionDate, aggregateUpdateDate, assayType, biologicalSex, bionetworkName, bundleUuid, bundleVersion, cellCount, cellLineType, contactName, contentDescription, dataUseRestriction, developmentStage, donorCount, donorDisease, duosId, effectiveCellCount, effectiveOrgan, entryId, fileFormat, fileId, fileName, fileSize, fileSource, fileVersion, genusSpecies, institution, instrumentManufacturerModel, isIntermediate, isTissueAtlasProject, laboratory, lastModifiedDate, libraryConstructionApproach, matrixCellCount, modelOrgan, modelOrganPart, nucleicAcidSource, organ, organPart, organismAge, organismAgeRange, pairedEnd, preservationMethod, project, projectDescription, projectEstimatedCellCount, projectId, projectTitle, publicationTitle, sampleDisease, sampleEntityType, selectedCellType, sha256, sourceId, sourceSpec, specimenDisease, specimenOrgan, specimenOrganPart, submissionDate, tissueAtlas, updateDate, workflow, 
accessible\n" } ] } @@ -11359,23 +11271,6 @@ ], "additionalProperties": false }, - "sampleId": { - "type": "object", - "properties": { - "is": { - "type": "array", - "items": { - "type": "string", - "nullable": true - }, - "minItems": 1 - } - }, - "required": [ - "is" - ], - "additionalProperties": false - }, "selectedCellType": { "type": "object", "properties": { @@ -11654,7 +11549,7 @@ } } }, - "description": "\nCriteria to filter entities from the search results.\n\nEach filter consists of a field name, an operator, and an array of field\nvalues. The available operators are \"is\", \"within\", \"contains\", and\n\"intersects\". Multiple filters are combined using \"and\" logic. For an\nentity to be included in the response, it must match all filters. How\nmultiple field values within a single filter are combined depends on the\noperator.\n\nFor the \"is\" operator, multiple values are combined using \"or\" logic.\nFor example, `{\"fileFormat\": {\"is\": [\"fastq\", \"fastq.gz\"]}}` selects\nentities where the file format is either \"fastq\" or \"fastq.gz\". For the\n\"within\", \"intersects\", and \"contains\" operators, the field values must\ncome in nested pairs specifying upper and lower bounds, and multiple\npairs are combined using \"and\" logic. For example, `{\"donorCount\":\n{\"within\": [[1,5], [5,10]]}}` selects entities whose donor organism\ncount falls within both ranges, i.e., is exactly 5.\n\nThe accessions field supports filtering for a specific accession and/or\nnamespace within a project. For example, `{\"accessions\": {\"is\": [\n{\"namespace\":\"array_express\"}]}}` will filter for projects that have an\n`array_express` accession. 
Similarly, `{\"accessions\": {\"is\": [\n{\"accession\":\"ERP112843\"}]}}` will filter for projects that have the\naccession `ERP112843` while `{\"accessions\": {\"is\": [\n{\"namespace\":\"array_express\", \"accession\": \"E-AAAA-00\"}]}}` will filter\nfor projects that match both values.\n\nThe organismAge field is special in that it contains two property keys:\nvalue and unit. For example, `{\"organismAge\": {\"is\": [{\"value\": \"20\",\n\"unit\": \"year\"}]}}`. Both keys are required. `{\"organismAge\": {\"is\":\n[null]}}` selects entities that have no organism age.\n\nSupported field names are: accessions, aggregateLastModifiedDate, aggregateSubmissionDate, aggregateUpdateDate, assayType, biologicalSex, bionetworkName, bundleUuid, bundleVersion, cellCount, cellLineType, contactName, contentDescription, dataUseRestriction, developmentStage, donorCount, donorDisease, duosId, effectiveCellCount, effectiveOrgan, entryId, fileFormat, fileId, fileName, fileSize, fileSource, fileVersion, genusSpecies, institution, instrumentManufacturerModel, isIntermediate, isTissueAtlasProject, laboratory, lastModifiedDate, libraryConstructionApproach, matrixCellCount, modelOrgan, modelOrganPart, nucleicAcidSource, organ, organPart, organismAge, organismAgeRange, pairedEnd, preservationMethod, project, projectDescription, projectEstimatedCellCount, projectId, projectTitle, publicationTitle, sampleDisease, sampleEntityType, sampleId, selectedCellType, sha256, sourceId, sourceSpec, specimenDisease, specimenOrgan, specimenOrganPart, submissionDate, tissueAtlas, updateDate, workflow, accessible\n" + "description": "\nCriteria to filter entities from the search results.\n\nEach filter consists of a field name, an operator, and an array of field\nvalues. The available operators are \"is\", \"within\", \"contains\", and\n\"intersects\". Multiple filters are combined using \"and\" logic. For an\nentity to be included in the response, it must match all filters. 
How\nmultiple field values within a single filter are combined depends on the\noperator.\n\nFor the \"is\" operator, multiple values are combined using \"or\" logic.\nFor example, `{\"fileFormat\": {\"is\": [\"fastq\", \"fastq.gz\"]}}` selects\nentities where the file format is either \"fastq\" or \"fastq.gz\". For the\n\"within\", \"intersects\", and \"contains\" operators, the field values must\ncome in nested pairs specifying upper and lower bounds, and multiple\npairs are combined using \"and\" logic. For example, `{\"donorCount\":\n{\"within\": [[1,5], [5,10]]}}` selects entities whose donor organism\ncount falls within both ranges, i.e., is exactly 5.\n\nThe accessions field supports filtering for a specific accession and/or\nnamespace within a project. For example, `{\"accessions\": {\"is\": [\n{\"namespace\":\"array_express\"}]}}` will filter for projects that have an\n`array_express` accession. Similarly, `{\"accessions\": {\"is\": [\n{\"accession\":\"ERP112843\"}]}}` will filter for projects that have the\naccession `ERP112843` while `{\"accessions\": {\"is\": [\n{\"namespace\":\"array_express\", \"accession\": \"E-AAAA-00\"}]}}` will filter\nfor projects that match both values.\n\nThe organismAge field is special in that it contains two property keys:\nvalue and unit. For example, `{\"organismAge\": {\"is\": [{\"value\": \"20\",\n\"unit\": \"year\"}]}}`. Both keys are required. 
`{\"organismAge\": {\"is\":\n[null]}}` selects entities that have no organism age.\n\nSupported field names are: accessions, aggregateLastModifiedDate, aggregateSubmissionDate, aggregateUpdateDate, assayType, biologicalSex, bionetworkName, bundleUuid, bundleVersion, cellCount, cellLineType, contactName, contentDescription, dataUseRestriction, developmentStage, donorCount, donorDisease, duosId, effectiveCellCount, effectiveOrgan, entryId, fileFormat, fileId, fileName, fileSize, fileSource, fileVersion, genusSpecies, institution, instrumentManufacturerModel, isIntermediate, isTissueAtlasProject, laboratory, lastModifiedDate, libraryConstructionApproach, matrixCellCount, modelOrgan, modelOrganPart, nucleicAcidSource, organ, organPart, organismAge, organismAgeRange, pairedEnd, preservationMethod, project, projectDescription, projectEstimatedCellCount, projectId, projectTitle, publicationTitle, sampleDisease, sampleEntityType, selectedCellType, sha256, sourceId, sourceSpec, specimenDisease, specimenOrgan, specimenOrganPart, submissionDate, tissueAtlas, updateDate, workflow, accessible\n" }, { "name": "format", @@ -13110,23 +13005,6 @@ ], "additionalProperties": false }, - "sampleId": { - "type": "object", - "properties": { - "is": { - "type": "array", - "items": { - "type": "string", - "nullable": true - }, - "minItems": 1 - } - }, - "required": [ - "is" - ], - "additionalProperties": false - }, "selectedCellType": { "type": "object", "properties": { @@ -13405,7 +13283,7 @@ } } }, - "description": "\nCriteria to filter entities from the search results.\n\nEach filter consists of a field name, an operator, and an array of field\nvalues. The available operators are \"is\", \"within\", \"contains\", and\n\"intersects\". Multiple filters are combined using \"and\" logic. For an\nentity to be included in the response, it must match all filters. 
How\nmultiple field values within a single filter are combined depends on the\noperator.\n\nFor the \"is\" operator, multiple values are combined using \"or\" logic.\nFor example, `{\"fileFormat\": {\"is\": [\"fastq\", \"fastq.gz\"]}}` selects\nentities where the file format is either \"fastq\" or \"fastq.gz\". For the\n\"within\", \"intersects\", and \"contains\" operators, the field values must\ncome in nested pairs specifying upper and lower bounds, and multiple\npairs are combined using \"and\" logic. For example, `{\"donorCount\":\n{\"within\": [[1,5], [5,10]]}}` selects entities whose donor organism\ncount falls within both ranges, i.e., is exactly 5.\n\nThe accessions field supports filtering for a specific accession and/or\nnamespace within a project. For example, `{\"accessions\": {\"is\": [\n{\"namespace\":\"array_express\"}]}}` will filter for projects that have an\n`array_express` accession. Similarly, `{\"accessions\": {\"is\": [\n{\"accession\":\"ERP112843\"}]}}` will filter for projects that have the\naccession `ERP112843` while `{\"accessions\": {\"is\": [\n{\"namespace\":\"array_express\", \"accession\": \"E-AAAA-00\"}]}}` will filter\nfor projects that match both values.\n\nThe organismAge field is special in that it contains two property keys:\nvalue and unit. For example, `{\"organismAge\": {\"is\": [{\"value\": \"20\",\n\"unit\": \"year\"}]}}`. Both keys are required. 
`{\"organismAge\": {\"is\":\n[null]}}` selects entities that have no organism age.\n\nSupported field names are: accessions, aggregateLastModifiedDate, aggregateSubmissionDate, aggregateUpdateDate, assayType, biologicalSex, bionetworkName, bundleUuid, bundleVersion, cellCount, cellLineType, contactName, contentDescription, dataUseRestriction, developmentStage, donorCount, donorDisease, duosId, effectiveCellCount, effectiveOrgan, entryId, fileFormat, fileId, fileName, fileSize, fileSource, fileVersion, genusSpecies, institution, instrumentManufacturerModel, isIntermediate, isTissueAtlasProject, laboratory, lastModifiedDate, libraryConstructionApproach, matrixCellCount, modelOrgan, modelOrganPart, nucleicAcidSource, organ, organPart, organismAge, organismAgeRange, pairedEnd, preservationMethod, project, projectDescription, projectEstimatedCellCount, projectId, projectTitle, publicationTitle, sampleDisease, sampleEntityType, sampleId, selectedCellType, sha256, sourceId, sourceSpec, specimenDisease, specimenOrgan, specimenOrganPart, submissionDate, tissueAtlas, updateDate, workflow, accessible\n" + "description": "\nCriteria to filter entities from the search results.\n\nEach filter consists of a field name, an operator, and an array of field\nvalues. The available operators are \"is\", \"within\", \"contains\", and\n\"intersects\". Multiple filters are combined using \"and\" logic. For an\nentity to be included in the response, it must match all filters. How\nmultiple field values within a single filter are combined depends on the\noperator.\n\nFor the \"is\" operator, multiple values are combined using \"or\" logic.\nFor example, `{\"fileFormat\": {\"is\": [\"fastq\", \"fastq.gz\"]}}` selects\nentities where the file format is either \"fastq\" or \"fastq.gz\". For the\n\"within\", \"intersects\", and \"contains\" operators, the field values must\ncome in nested pairs specifying upper and lower bounds, and multiple\npairs are combined using \"and\" logic. 
For example, `{\"donorCount\":\n{\"within\": [[1,5], [5,10]]}}` selects entities whose donor organism\ncount falls within both ranges, i.e., is exactly 5.\n\nThe accessions field supports filtering for a specific accession and/or\nnamespace within a project. For example, `{\"accessions\": {\"is\": [\n{\"namespace\":\"array_express\"}]}}` will filter for projects that have an\n`array_express` accession. Similarly, `{\"accessions\": {\"is\": [\n{\"accession\":\"ERP112843\"}]}}` will filter for projects that have the\naccession `ERP112843` while `{\"accessions\": {\"is\": [\n{\"namespace\":\"array_express\", \"accession\": \"E-AAAA-00\"}]}}` will filter\nfor projects that match both values.\n\nThe organismAge field is special in that it contains two property keys:\nvalue and unit. For example, `{\"organismAge\": {\"is\": [{\"value\": \"20\",\n\"unit\": \"year\"}]}}`. Both keys are required. `{\"organismAge\": {\"is\":\n[null]}}` selects entities that have no organism age.\n\nSupported field names are: accessions, aggregateLastModifiedDate, aggregateSubmissionDate, aggregateUpdateDate, assayType, biologicalSex, bionetworkName, bundleUuid, bundleVersion, cellCount, cellLineType, contactName, contentDescription, dataUseRestriction, developmentStage, donorCount, donorDisease, duosId, effectiveCellCount, effectiveOrgan, entryId, fileFormat, fileId, fileName, fileSize, fileSource, fileVersion, genusSpecies, institution, instrumentManufacturerModel, isIntermediate, isTissueAtlasProject, laboratory, lastModifiedDate, libraryConstructionApproach, matrixCellCount, modelOrgan, modelOrganPart, nucleicAcidSource, organ, organPart, organismAge, organismAgeRange, pairedEnd, preservationMethod, project, projectDescription, projectEstimatedCellCount, projectId, projectTitle, publicationTitle, sampleDisease, sampleEntityType, selectedCellType, sha256, sourceId, sourceSpec, specimenDisease, specimenOrgan, specimenOrganPart, submissionDate, tissueAtlas, updateDate, workflow, 
accessible\n" }, { "name": "format", diff --git a/src/azul/plugins/metadata/hca/__init__.py b/src/azul/plugins/metadata/hca/__init__.py index 7f554f351f..63c2ab2c73 100644 --- a/src/azul/plugins/metadata/hca/__init__.py +++ b/src/azul/plugins/metadata/hca/__init__.py @@ -186,7 +186,7 @@ def exposed_indices(self) -> dict[EntityType, Sorting]: files=Sorting(field_name='fileName'), projects=Sorting(field_name='projectTitle', max_page_size=75), - samples=Sorting(field_name='sampleId') + samples=Sorting(field_name='entryId') ) @property @@ -279,7 +279,6 @@ def _field_mapping(self) -> InverseFieldMapping: 'donor_count': 'donorCount' }, 'samples': { - 'biomaterial_id': 'sampleId', 'entity_type': 'sampleEntityType', 'organ': 'organ', 'organ_part': 'organPart', diff --git a/test/service/test_response.py b/test/service/test_response.py index 9ef07e8adb..310e6d737b 100644 --- a/test/service/test_response.py +++ b/test/service/test_response.py @@ -2087,7 +2087,7 @@ def test_bad_search_after_search_before(self): """ Test that invalid JSON for search_after or search_before raise a 400 """ - query_params = self._params(size=1, sort='sampleId', order='asc') + query_params = self._params(size=1, sort='entryId', order='asc') url = self.base_url.set(path='/index/samples', args=query_params) # Get page 1 response = requests.get(str(url)) @@ -3724,7 +3724,7 @@ def test(self): 'default_order': 'asc' }, 'samples': { - 'default_sort': 'sampleId', + 'default_sort': 'entryId', 'default_order': 'asc' } } From 3c59cef3ff75104ecff903258b9f5f7190e3b897 Mon Sep 17 00:00:00 2001 From: Daniel Sotirhos Date: Fri, 9 Jan 2026 14:43:37 -0800 Subject: [PATCH 03/59] Document need for aggregation of projects.document_id in HCA (#6793) --- src/azul/plugins/metadata/hca/indexer/aggregate.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/azul/plugins/metadata/hca/indexer/aggregate.py b/src/azul/plugins/metadata/hca/indexer/aggregate.py index f92cc5b169..b17f1de405 100644 --- 
a/src/azul/plugins/metadata/hca/indexer/aggregate.py +++ b/src/azul/plugins/metadata/hca/indexer/aggregate.py @@ -191,6 +191,12 @@ class ProjectAggregator(SimpleAggregator): def _accumulator(self, field) -> Accumulator | None: if field == 'document_id': + # If any project IDs are missing from the aggregate, those projects + # will be omitted during the verbatim handover. Projects are a "hot" + # entity type, and we can't track their hubs in replica documents, + # so we rely on the inner entity IDs instead. We also need to + # aggregate `document_id` to allow filtering by `projectId` on + # non-project endpoints. return SetAccumulator(max_size=100) elif field in ('project_description', 'contact_names', From cbe835f73ae70647d15077ecf95e2df80a5a43e0 Mon Sep 17 00:00:00 2001 From: Daniel Sotirhos Date: Tue, 6 Jan 2026 10:11:37 -0800 Subject: [PATCH 04/59] [r] Conditionally remove aggregation of cell_suspensions.biomaterial_id in HCA (#6793) --- src/azul/plugins/metadata/hca/indexer/aggregate.py | 9 ++++++++- ...b4dc15ad4d9d.2018-11-02T11:33:44.698028Z.results.json | 9 --------- 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/src/azul/plugins/metadata/hca/indexer/aggregate.py b/src/azul/plugins/metadata/hca/indexer/aggregate.py index b17f1de405..4d094f9fa8 100644 --- a/src/azul/plugins/metadata/hca/indexer/aggregate.py +++ b/src/azul/plugins/metadata/hca/indexer/aggregate.py @@ -143,7 +143,14 @@ def _group_keys(self, entity) -> tuple[Any, ...]: return frozenset(entity['organ']), def _accumulator(self, field) -> Accumulator | None: - if field in self.cell_count_fields: + if field == 'biomaterial_id': + # These fields are only aggregated for files, where they are needed + # for compact and PFB manifests + if self.outer_entity_type == 'files': + return super()._accumulator(field) + else: + return None + elif field in self.cell_count_fields: return DistinctAccumulator(SumAccumulator()) else: return super()._accumulator(field) diff --git 
a/test/indexer/data/aaa96233-bf27-44c7-82df-b4dc15ad4d9d.2018-11-02T11:33:44.698028Z.results.json b/test/indexer/data/aaa96233-bf27-44c7-82df-b4dc15ad4d9d.2018-11-02T11:33:44.698028Z.results.json index a77d93573b..35614c3e69 100644 --- a/test/indexer/data/aaa96233-bf27-44c7-82df-b4dc15ad4d9d.2018-11-02T11:33:44.698028Z.results.json +++ b/test/indexer/data/aaa96233-bf27-44c7-82df-b4dc15ad4d9d.2018-11-02T11:33:44.698028Z.results.json @@ -1521,9 +1521,6 @@ "document_id": [ "412898c5-5b9b-4907-b07c-e9b89666e204" ], - "biomaterial_id": [ - "GSM2172585 1" - ], "total_estimated_cells_redundant": 0, "total_estimated_cells_redundant_": 0, "total_estimated_cells": 1, @@ -2468,9 +2465,6 @@ "document_id": [ "412898c5-5b9b-4907-b07c-e9b89666e204" ], - "biomaterial_id": [ - "GSM2172585 1" - ], "selected_cell_type": [ "~null" ], @@ -2763,9 +2757,6 @@ "document_id": [ "412898c5-5b9b-4907-b07c-e9b89666e204" ], - "biomaterial_id": [ - "GSM2172585 1" - ], "selected_cell_type": [ "~null" ], From 6876dc991c28ac0fa9e96063981cfefd39017bdf Mon Sep 17 00:00:00 2001 From: Daniel Sotirhos Date: Mon, 5 Jan 2026 17:06:58 -0800 Subject: [PATCH 05/59] [r] Conditionally remove aggregation of cell_suspensions.document_id in HCA (#6793) --- src/azul/plugins/metadata/hca/indexer/aggregate.py | 2 +- ...b4dc15ad4d9d.2018-11-02T11:33:44.698028Z.results.json | 9 --------- 2 files changed, 1 insertion(+), 10 deletions(-) diff --git a/src/azul/plugins/metadata/hca/indexer/aggregate.py b/src/azul/plugins/metadata/hca/indexer/aggregate.py index 4d094f9fa8..1831dd4979 100644 --- a/src/azul/plugins/metadata/hca/indexer/aggregate.py +++ b/src/azul/plugins/metadata/hca/indexer/aggregate.py @@ -143,7 +143,7 @@ def _group_keys(self, entity) -> tuple[Any, ...]: return frozenset(entity['organ']), def _accumulator(self, field) -> Accumulator | None: - if field == 'biomaterial_id': + if field in ('biomaterial_id', 'document_id'): # These fields are only aggregated for files, where they are needed # for compact and 
PFB manifests if self.outer_entity_type == 'files': diff --git a/test/indexer/data/aaa96233-bf27-44c7-82df-b4dc15ad4d9d.2018-11-02T11:33:44.698028Z.results.json b/test/indexer/data/aaa96233-bf27-44c7-82df-b4dc15ad4d9d.2018-11-02T11:33:44.698028Z.results.json index 35614c3e69..215052289e 100644 --- a/test/indexer/data/aaa96233-bf27-44c7-82df-b4dc15ad4d9d.2018-11-02T11:33:44.698028Z.results.json +++ b/test/indexer/data/aaa96233-bf27-44c7-82df-b4dc15ad4d9d.2018-11-02T11:33:44.698028Z.results.json @@ -1518,9 +1518,6 @@ ], "cell_suspensions": [ { - "document_id": [ - "412898c5-5b9b-4907-b07c-e9b89666e204" - ], "total_estimated_cells_redundant": 0, "total_estimated_cells_redundant_": 0, "total_estimated_cells": 1, @@ -2462,9 +2459,6 @@ ], "cell_suspensions": [ { - "document_id": [ - "412898c5-5b9b-4907-b07c-e9b89666e204" - ], "selected_cell_type": [ "~null" ], @@ -2754,9 +2748,6 @@ ], "cell_suspensions": [ { - "document_id": [ - "412898c5-5b9b-4907-b07c-e9b89666e204" - ], "selected_cell_type": [ "~null" ], From 729d6bb927425b9ff7d83eee1437dc20b43e133f Mon Sep 17 00:00:00 2001 From: Daniel Sotirhos Date: Tue, 6 Jan 2026 10:31:12 -0800 Subject: [PATCH 06/59] [r A] Conditionally remove aggregation of cell_lines.biomaterial_id in HCA (#6793) --- src/azul/plugins/metadata/hca/indexer/aggregate.py | 12 +++++++++++- src/azul/plugins/metadata/hca/service/response.py | 1 - test/service/test_response.py | 4 ---- 3 files changed, 11 insertions(+), 6 deletions(-) diff --git a/src/azul/plugins/metadata/hca/indexer/aggregate.py b/src/azul/plugins/metadata/hca/indexer/aggregate.py index 1831dd4979..92ddf2f0f2 100644 --- a/src/azul/plugins/metadata/hca/indexer/aggregate.py +++ b/src/azul/plugins/metadata/hca/indexer/aggregate.py @@ -157,7 +157,17 @@ def _accumulator(self, field) -> Accumulator | None: class CellLineAggregator(SimpleAggregator): - pass + + def _accumulator(self, field) -> Accumulator | None: + if field == 'biomaterial_id': + # These fields are only aggregated for files, 
where they are needed + # for compact and PFB manifests + if self.outer_entity_type == 'files': + return super()._accumulator(field) + else: + return None + else: + return super()._accumulator(field) class DonorOrganismAggregator(SimpleAggregator): diff --git a/src/azul/plugins/metadata/hca/service/response.py b/src/azul/plugins/metadata/hca/service/response.py index 768faebff0..de735a0bd1 100644 --- a/src/azul/plugins/metadata/hca/service/response.py +++ b/src/azul/plugins/metadata/hca/service/response.py @@ -459,7 +459,6 @@ def make_cell_suspensions(self, entry) -> MutableJSONs: def make_cell_line(self, cell_line) -> MutableJSON: return { - 'id': cell_line['biomaterial_id'], 'cellLineType': cell_line.get('cell_line_type', None), 'modelOrgan': cell_line.get('model_organ', None), } diff --git a/test/service/test_response.py b/test/service/test_response.py index 310e6d737b..a1eac5196a 100644 --- a/test/service/test_response.py +++ b/test/service/test_response.py @@ -982,7 +982,6 @@ def test_response_stage_projects_cell_line(self): stage = self._response_stage('projects') response = stage.process_response((hits, self.paginations[0], {})) expected_cell_lines = { - 'id': ['cell_line_Day7_hiPSC-CM_BioRep2', 'cell_line_GM18517'], 'cellLineType': ['primary', 'stem cell-derived'], 'modelOrgan': ['blood (parent_cell_line)', 'blood (child_cell_line)'], } @@ -992,7 +991,6 @@ def test_response_stage_projects_cell_line(self): expected_samples = { 'sampleEntityType': ['cellLines'], 'effectiveOrgan': ['blood (child_cell_line)'], - 'id': ['cell_line_Day7_hiPSC-CM_BioRep2'], 'cellLineType': ['stem cell-derived'], 'modelOrgan': ['blood (child_cell_line)'], } @@ -2392,7 +2390,6 @@ def test_inner_entity_samples(self): { 'sampleEntityType': ['cellLines'], 'effectiveOrgan': ['immune system'], - 'id': ['Cell_line_2'], 'cellLineType': ['primary'], 'modelOrgan': ['immune system'], }, @@ -2432,7 +2429,6 @@ def test_inner_entity_samples(self): { 'sampleEntityType': ['cellLines'], 
'effectiveOrgan': ['immune system'], - 'id': ['Cell_line_2'], 'cellLineType': ['primary'], 'modelOrgan': ['immune system'], }, From e7e4d9d394ab96f64d78f64c0635d53f37ab49d9 Mon Sep 17 00:00:00 2001 From: Daniel Sotirhos Date: Tue, 6 Jan 2026 08:48:13 -0800 Subject: [PATCH 07/59] [r] Conditionally remove aggregation of cell_lines.document_id in HCA (#6793) --- src/azul/plugins/metadata/hca/indexer/aggregate.py | 2 +- test/indexer/test_indexer.py | 8 +++----- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/src/azul/plugins/metadata/hca/indexer/aggregate.py b/src/azul/plugins/metadata/hca/indexer/aggregate.py index 92ddf2f0f2..b9c7d93bf0 100644 --- a/src/azul/plugins/metadata/hca/indexer/aggregate.py +++ b/src/azul/plugins/metadata/hca/indexer/aggregate.py @@ -159,7 +159,7 @@ def _accumulator(self, field) -> Accumulator | None: class CellLineAggregator(SimpleAggregator): def _accumulator(self, field) -> Accumulator | None: - if field == 'biomaterial_id': + if field in ('biomaterial_id', 'document_id'): # These fields are only aggregated for files, where they are needed # for compact and PFB manifests if self.outer_entity_type == 'files': diff --git a/test/indexer/test_indexer.py b/test/indexer/test_indexer.py index 2669962f9b..d51d26a411 100644 --- a/test/indexer/test_indexer.py +++ b/test/indexer/test_indexer.py @@ -1932,15 +1932,13 @@ def test_cell_line_sample(self): if qualifier == 'samples': sample = one(contents['samples']) sample_entity_type = sample['entity_type'] - if aggregate: - document_ids = one(contents[sample_entity_type])['document_id'] - elif contribution: + if contribution: document_ids = [d['document_id'] for d in contents[sample_entity_type]] + self.assertIn(sample['document_id'], document_ids) entity = one(d for d in contents[sample_entity_type] if d['document_id'] == sample['document_id']) self.assertEqual(sample['biomaterial_id'], entity['biomaterial_id']) else: - assert False, doc_type - self.assertIn(sample['document_id'],
document_ids) + assert aggregate, doc_type self.assertEqual(one(contents['specimens'])['organ'], ['blood'] if aggregate else 'blood') self.assertEqual(one(contents['specimens'])['organ_part'], ['venous blood']) self.assertEqual(len(contents['cell_lines']), 1 if aggregate else 2) From 6f99b22fcd2b61b5bb7f505b27e1296eda3b0776 Mon Sep 17 00:00:00 2001 From: Daniel Sotirhos Date: Tue, 6 Jan 2026 10:43:41 -0800 Subject: [PATCH 08/59] [r A] Conditionally remove aggregation of donors.biomaterial_id in HCA (#6793) --- .../plugins/metadata/hca/indexer/aggregate.py | 9 ++++++++- src/azul/plugins/metadata/hca/service/response.py | 1 - ...d4d9d.2018-11-02T11:33:44.698028Z.results.json | 12 ------------ test/indexer/test_indexer.py | 1 + test/service/test_app_logging.py | 2 +- test/service/test_response.py | 15 --------------- 6 files changed, 10 insertions(+), 30 deletions(-) diff --git a/src/azul/plugins/metadata/hca/indexer/aggregate.py b/src/azul/plugins/metadata/hca/indexer/aggregate.py index b9c7d93bf0..a24bab0be1 100644 --- a/src/azul/plugins/metadata/hca/indexer/aggregate.py +++ b/src/azul/plugins/metadata/hca/indexer/aggregate.py @@ -179,7 +179,14 @@ def _transform_entity(self, entity: JSON) -> JSON: } def _accumulator(self, field) -> Accumulator | None: - if field == 'organism_age_range': + if field == 'biomaterial_id': + # These fields are only aggregated for files, where they are needed + # for compact and PFB manifests + if self.outer_entity_type == 'files': + return super()._accumulator(field) + else: + return None + elif field == 'organism_age_range': return SetAccumulator(max_size=100) elif field == 'organism_age': return SetOfDictAccumulator(max_size=100, diff --git a/src/azul/plugins/metadata/hca/service/response.py b/src/azul/plugins/metadata/hca/service/response.py index de735a0bd1..da87aa10ad 100644 --- a/src/azul/plugins/metadata/hca/service/response.py +++ b/src/azul/plugins/metadata/hca/service/response.py @@ -468,7 +468,6 @@ def make_cell_lines(self, 
entry) -> MutableJSONs: def make_donor(self, donor) -> MutableJSON: return { - 'id': donor['biomaterial_id'], 'donorCount': donor.get('donor_count', None), 'developmentStage': donor.get('development_stage', None), 'genusSpecies': donor.get('genus_species', None), diff --git a/test/indexer/data/aaa96233-bf27-44c7-82df-b4dc15ad4d9d.2018-11-02T11:33:44.698028Z.results.json b/test/indexer/data/aaa96233-bf27-44c7-82df-b4dc15ad4d9d.2018-11-02T11:33:44.698028Z.results.json index 215052289e..a513fd7a8e 100644 --- a/test/indexer/data/aaa96233-bf27-44c7-82df-b4dc15ad4d9d.2018-11-02T11:33:44.698028Z.results.json +++ b/test/indexer/data/aaa96233-bf27-44c7-82df-b4dc15ad4d9d.2018-11-02T11:33:44.698028Z.results.json @@ -1539,9 +1539,6 @@ "document_id": [ "7b07b9d0-cc0e-4098-9f64-f4a569f7d746" ], - "biomaterial_id": [ - "DID_scRSq06" - ], "donor_count": 1, "donor_count_": 1, "biological_sex": [ @@ -2429,9 +2426,6 @@ "document_id": [ "7b07b9d0-cc0e-4098-9f64-f4a569f7d746" ], - "biomaterial_id": [ - "DID_scRSq06" - ], "donor_count": 1, "donor_count_": 1, "genus_species": [ @@ -2718,9 +2712,6 @@ "document_id": [ "7b07b9d0-cc0e-4098-9f64-f4a569f7d746" ], - "biomaterial_id": [ - "DID_scRSq06" - ], "donor_count": 1, "donor_count_": 1, "genus_species": [ @@ -3329,9 +3320,6 @@ "document_id": [ "7b07b9d0-cc0e-4098-9f64-f4a569f7d746" ], - "biomaterial_id": [ - "DID_scRSq06" - ], "donor_count": 1, "donor_count_": 1, "biological_sex": [ diff --git a/test/indexer/test_indexer.py b/test/indexer/test_indexer.py index d51d26a411..f18912d5cf 100644 --- a/test/indexer/test_indexer.py +++ b/test/indexer/test_indexer.py @@ -2025,6 +2025,7 @@ def test_sample_with_no_donor(self): k: (v if isinstance(v, list) else [v]) + ([] if k == 'organism_age_range' or True else [None]) for k, v in donor.items() + if k != 'biomaterial_id' } } hits = self._get_all_hits() diff --git a/test/service/test_app_logging.py b/test/service/test_app_logging.py index 15c2aab84d..9df45d9ccb 100644 --- 
a/test/service/test_app_logging.py +++ b/test/service/test_app_logging.py @@ -153,7 +153,7 @@ def filter_body(organ: str) -> JSON: elif debug == 1: expected_log = f'… with a response body starting in {body[:prefix_len]}' elif debug > 1: - expected_log = f'… with a response body of length 9137 being {body}' + expected_log = f'… with a response body of length 9114 being {body}' else: assert False self.assertEqual(expected_log, body_log_message) diff --git a/test/service/test_response.py b/test/service/test_response.py index a1eac5196a..6cfd29d25b 100644 --- a/test/service/test_response.py +++ b/test/service/test_response.py @@ -269,7 +269,6 @@ def test_response_stage_files(self): 'disease': ['normal'], 'developmentStage': [None], 'genusSpecies': ['Australopithecus'], - 'id': ['DID_scRSq06'], 'donorCount': 1, 'organismAge': [{'value': '38', 'unit': 'year'}], 'organismAgeRange': [[1198368000.0, 1198368000.0]], @@ -537,7 +536,6 @@ def test_response_stage_projects(self): 'disease': ['normal'], 'developmentStage': [None], 'genusSpecies': ['Australopithecus'], - 'id': ['DID_scRSq06'], 'donorCount': 1, 'organismAge': [{'value': '38', 'unit': 'year'}], 'organismAgeRange': [[1198368000.0, 1198368000.0]], @@ -768,7 +766,6 @@ def test_response_stage_projects_accessions(self): 'disease': ['H syndrome'], 'developmentStage': ['human adult stage'], 'genusSpecies': ['Homo sapiens'], - 'id': ['donor_ID_1'], 'donorCount': 1, 'organismAge': [{'value': '20', 'unit': 'year'}], 'organismAgeRange': [[630720000.0, 630720000.0]], @@ -1223,12 +1220,6 @@ def test_ranged_values(self): 'genusSpecies': [ 'Homo sapiens' ], - 'id': [ - 'HPSI0314i-hoik', - 'HPSI0214i-wibj', - 'HPSI0314i-sojd', - 'HPSI0214i-kucg' - ], 'donorCount': 4, 'organismAge': [ {'value': '45-49', 'unit': 'year'}, @@ -1251,12 +1242,6 @@ def test_ranged_values(self): 'genusSpecies': [ 'Homo sapiens' ], - 'id': [ - 'HPSI0314i-hoik', - 'HPSI0214i-wibj', - 'HPSI0314i-sojd', - 'HPSI0214i-kucg' - ], 'donorCount': 4, 'organismAge': [ 
{'value': '40-44', 'unit': 'year'}, From 18bf9eb1e5bc89ef9b909c480c90f4403b40479f Mon Sep 17 00:00:00 2001 From: Daniel Sotirhos Date: Tue, 6 Jan 2026 10:58:01 -0800 Subject: [PATCH 09/59] [r A] Conditionally remove aggregation of organoids.biomaterial_id in HCA (#6793) --- src/azul/plugins/metadata/hca/indexer/aggregate.py | 12 +++++++++++- src/azul/plugins/metadata/hca/service/response.py | 1 - test/service/test_response.py | 6 ------ 3 files changed, 11 insertions(+), 8 deletions(-) diff --git a/src/azul/plugins/metadata/hca/indexer/aggregate.py b/src/azul/plugins/metadata/hca/indexer/aggregate.py index a24bab0be1..44cbc425ef 100644 --- a/src/azul/plugins/metadata/hca/indexer/aggregate.py +++ b/src/azul/plugins/metadata/hca/indexer/aggregate.py @@ -208,7 +208,17 @@ def _accumulator(self, field) -> Accumulator | None: class OrganoidAggregator(SimpleAggregator): - pass + + def _accumulator(self, field) -> Accumulator | None: + if field == 'biomaterial_id': + # These fields are only aggregated for files, where they are needed + # for compact and PFB manifests + if self.outer_entity_type == 'files': + return super()._accumulator(field) + else: + return None + else: + return super()._accumulator(field) class ProjectAggregator(SimpleAggregator): diff --git a/src/azul/plugins/metadata/hca/service/response.py b/src/azul/plugins/metadata/hca/service/response.py index da87aa10ad..e0ffb3c01f 100644 --- a/src/azul/plugins/metadata/hca/service/response.py +++ b/src/azul/plugins/metadata/hca/service/response.py @@ -482,7 +482,6 @@ def make_donors(self, entry) -> MutableJSONs: def make_organoid(self, organoid) -> MutableJSON: return { - 'id': organoid['biomaterial_id'], 'modelOrgan': organoid.get('model_organ', None), 'modelOrganPart': organoid.get('model_organ_part', None) } diff --git a/test/service/test_response.py b/test/service/test_response.py index 6cfd29d25b..569a16577f 100644 --- a/test/service/test_response.py +++ b/test/service/test_response.py @@ -2396,12 +2396,6 
@@ def test_inner_entity_samples(self): { 'sampleEntityType': ['organoids'], 'effectiveOrgan': ['Brain'], - 'id': [ - 'Org_HPSI0214i-kucg_2_2', - 'Org_HPSI0214i-wibj_2_2', - 'Org_HPSI0314i-hoik_1_2', - 'Org_HPSI0314i-sojd_3_2', - ], 'modelOrgan': ['Brain'], 'modelOrganPart': [None], } From 4bbbbdac6271f519d8e70055f67cff58686854b7 Mon Sep 17 00:00:00 2001 From: Daniel Sotirhos Date: Tue, 6 Jan 2026 08:57:18 -0800 Subject: [PATCH 10/59] [r] Conditionally remove aggregation of organoids.document_id in HCA (#6793) --- src/azul/plugins/metadata/hca/indexer/aggregate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/azul/plugins/metadata/hca/indexer/aggregate.py b/src/azul/plugins/metadata/hca/indexer/aggregate.py index 44cbc425ef..b4825dbadd 100644 --- a/src/azul/plugins/metadata/hca/indexer/aggregate.py +++ b/src/azul/plugins/metadata/hca/indexer/aggregate.py @@ -210,7 +210,7 @@ def _accumulator(self, field) -> Accumulator | None: class OrganoidAggregator(SimpleAggregator): def _accumulator(self, field) -> Accumulator | None: - if field == 'biomaterial_id': + if field in ('biomaterial_id', 'document_id'): # These fields are only aggregated for files, where they are needed # for compact and PFB manifests if self.outer_entity_type == 'files': From 0eb96a35eecd8bb34f5e4c238e65fb30474d4197 Mon Sep 17 00:00:00 2001 From: Daniel Sotirhos Date: Tue, 6 Jan 2026 11:07:44 -0800 Subject: [PATCH 11/59] [r] Conditionally remove aggregation of sequencing_inputs.biomaterial_id in HCA (#6793) --- src/azul/plugins/metadata/hca/indexer/aggregate.py | 12 +++++++++++- ...15ad4d9d.2018-11-02T11:33:44.698028Z.results.json | 12 ------------ 2 files changed, 11 insertions(+), 13 deletions(-) diff --git a/src/azul/plugins/metadata/hca/indexer/aggregate.py b/src/azul/plugins/metadata/hca/indexer/aggregate.py index b4825dbadd..617bb27033 100644 --- a/src/azul/plugins/metadata/hca/indexer/aggregate.py +++ b/src/azul/plugins/metadata/hca/indexer/aggregate.py @@ -271,7 
+271,17 @@ def _default_accumulator(self) -> Accumulator | None: class SequencingInputAggregator(SimpleAggregator): - pass + + def _accumulator(self, field) -> Accumulator | None: + if field == 'biomaterial_id': + # These fields are only aggregated for files, where they are needed + # for compact and PFB manifests + if self.outer_entity_type == 'files': + return super()._accumulator(field) + else: + return None + else: + return super()._accumulator(field) class SequencingProcessAggregator(SimpleAggregator): diff --git a/test/indexer/data/aaa96233-bf27-44c7-82df-b4dc15ad4d9d.2018-11-02T11:33:44.698028Z.results.json b/test/indexer/data/aaa96233-bf27-44c7-82df-b4dc15ad4d9d.2018-11-02T11:33:44.698028Z.results.json index a513fd7a8e..9e80b20b6a 100644 --- a/test/indexer/data/aaa96233-bf27-44c7-82df-b4dc15ad4d9d.2018-11-02T11:33:44.698028Z.results.json +++ b/test/indexer/data/aaa96233-bf27-44c7-82df-b4dc15ad4d9d.2018-11-02T11:33:44.698028Z.results.json @@ -1464,9 +1464,6 @@ ], "sequencing_inputs": [ { - "biomaterial_id": [ - "GSM2172585 1" - ], "document_id": [ "412898c5-5b9b-4907-b07c-e9b89666e204" ], @@ -2367,9 +2364,6 @@ ], "sequencing_inputs": [ { - "biomaterial_id": [ - "GSM2172585 1" - ], "document_id": [ "412898c5-5b9b-4907-b07c-e9b89666e204" ], @@ -2653,9 +2647,6 @@ ], "sequencing_inputs": [ { - "biomaterial_id": [ - "GSM2172585 1" - ], "document_id": [ "412898c5-5b9b-4907-b07c-e9b89666e204" ], @@ -3243,9 +3234,6 @@ ], "sequencing_inputs": [ { - "biomaterial_id": [ - "GSM2172585 1" - ], "document_id": [ "412898c5-5b9b-4907-b07c-e9b89666e204" ], From 27f6387e42c5c77bc89aaa3dbde6cbe5f0aed3e4 Mon Sep 17 00:00:00 2001 From: Daniel Sotirhos Date: Tue, 6 Jan 2026 09:06:32 -0800 Subject: [PATCH 12/59] [r] Conditionally remove aggregation of sequencing_inputs.document_id in HCA (#6793) --- src/azul/plugins/metadata/hca/indexer/aggregate.py | 2 +- ...15ad4d9d.2018-11-02T11:33:44.698028Z.results.json | 12 ------------ 2 files changed, 1 insertion(+), 13 deletions(-) diff 
--git a/src/azul/plugins/metadata/hca/indexer/aggregate.py b/src/azul/plugins/metadata/hca/indexer/aggregate.py index 617bb27033..e927f06691 100644 --- a/src/azul/plugins/metadata/hca/indexer/aggregate.py +++ b/src/azul/plugins/metadata/hca/indexer/aggregate.py @@ -273,7 +273,7 @@ def _default_accumulator(self) -> Accumulator | None: class SequencingInputAggregator(SimpleAggregator): def _accumulator(self, field) -> Accumulator | None: - if field == 'biomaterial_id': + if field in ('biomaterial_id', 'document_id'): # These fields are only aggregated for files, where they are needed # for compact and PFB manifests if self.outer_entity_type == 'files': diff --git a/test/indexer/data/aaa96233-bf27-44c7-82df-b4dc15ad4d9d.2018-11-02T11:33:44.698028Z.results.json b/test/indexer/data/aaa96233-bf27-44c7-82df-b4dc15ad4d9d.2018-11-02T11:33:44.698028Z.results.json index 9e80b20b6a..0dd1ddeae9 100644 --- a/test/indexer/data/aaa96233-bf27-44c7-82df-b4dc15ad4d9d.2018-11-02T11:33:44.698028Z.results.json +++ b/test/indexer/data/aaa96233-bf27-44c7-82df-b4dc15ad4d9d.2018-11-02T11:33:44.698028Z.results.json @@ -1464,9 +1464,6 @@ ], "sequencing_inputs": [ { - "document_id": [ - "412898c5-5b9b-4907-b07c-e9b89666e204" - ], "sequencing_input_type": [ "cell_suspension" ] @@ -2364,9 +2361,6 @@ ], "sequencing_inputs": [ { - "document_id": [ - "412898c5-5b9b-4907-b07c-e9b89666e204" - ], "sequencing_input_type": [ "cell_suspension" ] @@ -2647,9 +2641,6 @@ ], "sequencing_inputs": [ { - "document_id": [ - "412898c5-5b9b-4907-b07c-e9b89666e204" - ], "sequencing_input_type": [ "cell_suspension" ] @@ -3234,9 +3225,6 @@ ], "sequencing_inputs": [ { - "document_id": [ - "412898c5-5b9b-4907-b07c-e9b89666e204" - ], "sequencing_input_type": [ "cell_suspension" ] From 9b6d24d782c4031d46700a5077c83812aef5729a Mon Sep 17 00:00:00 2001 From: Daniel Sotirhos Date: Tue, 6 Jan 2026 09:16:54 -0800 Subject: [PATCH 13/59] [r] Conditionally remove aggregation of sequencing_processes.document_id in HCA (#6793) --- 
.../plugins/metadata/hca/indexer/aggregate.py | 11 +++++++++ ...d.2018-11-02T11:33:44.698028Z.results.json | 24 ++++--------------- 2 files changed, 15 insertions(+), 20 deletions(-) diff --git a/src/azul/plugins/metadata/hca/indexer/aggregate.py b/src/azul/plugins/metadata/hca/indexer/aggregate.py index e927f06691..2b011eba41 100644 --- a/src/azul/plugins/metadata/hca/indexer/aggregate.py +++ b/src/azul/plugins/metadata/hca/indexer/aggregate.py @@ -286,6 +286,17 @@ def _accumulator(self, field) -> Accumulator | None: class SequencingProcessAggregator(SimpleAggregator): + def _accumulator(self, field) -> Accumulator | None: + if field == 'document_id': + # These fields are only aggregated for files, where they are needed + # for compact and PFB manifests + if self.outer_entity_type == 'files': + return super()._accumulator(field) + else: + return None + else: + return super()._accumulator(field) + def _default_accumulator(self) -> Accumulator | None: return SetAccumulator(max_size=10) diff --git a/test/indexer/data/aaa96233-bf27-44c7-82df-b4dc15ad4d9d.2018-11-02T11:33:44.698028Z.results.json b/test/indexer/data/aaa96233-bf27-44c7-82df-b4dc15ad4d9d.2018-11-02T11:33:44.698028Z.results.json index 0dd1ddeae9..357bee3d79 100644 --- a/test/indexer/data/aaa96233-bf27-44c7-82df-b4dc15ad4d9d.2018-11-02T11:33:44.698028Z.results.json +++ b/test/indexer/data/aaa96233-bf27-44c7-82df-b4dc15ad4d9d.2018-11-02T11:33:44.698028Z.results.json @@ -1470,11 +1470,7 @@ } ], "sequencing_processes": [ - { - "document_id": [ - "771ddaf6-3a4f-4314-97fe-6294ff8e25a4" - ] - } + {} ], "specimens": [ { @@ -2367,11 +2363,7 @@ } ], "sequencing_processes": [ - { - "document_id": [ - "771ddaf6-3a4f-4314-97fe-6294ff8e25a4" - ] - } + {} ], "specimens": [ { @@ -2647,11 +2639,7 @@ } ], "sequencing_processes": [ - { - "document_id": [ - "771ddaf6-3a4f-4314-97fe-6294ff8e25a4" - ] - } + {} ], "specimens": [ { @@ -3231,11 +3219,7 @@ } ], "sequencing_processes": [ - { - "document_id": [ - 
"771ddaf6-3a4f-4314-97fe-6294ff8e25a4" - ] - } + {} ], "specimens": [ { From f6ff6b25f1da4cf8fa0d896a6202c6eaea46e5a3 Mon Sep 17 00:00:00 2001 From: Daniel Sotirhos Date: Tue, 6 Jan 2026 09:48:40 -0800 Subject: [PATCH 14/59] [r A] Conditionally remove aggregation of specimens.biomaterial_id in HCA (#6793) --- src/azul/plugins/metadata/hca/indexer/aggregate.py | 12 +++++++++++- src/azul/plugins/metadata/hca/service/response.py | 1 - ...15ad4d9d.2018-11-02T11:33:44.698028Z.results.json | 12 ------------ test/service/test_app_logging.py | 2 +- test/service/test_response.py | 9 --------- 5 files changed, 12 insertions(+), 24 deletions(-) diff --git a/src/azul/plugins/metadata/hca/indexer/aggregate.py b/src/azul/plugins/metadata/hca/indexer/aggregate.py index 2b011eba41..b85956a106 100644 --- a/src/azul/plugins/metadata/hca/indexer/aggregate.py +++ b/src/azul/plugins/metadata/hca/indexer/aggregate.py @@ -120,7 +120,17 @@ class SampleAggregator(SimpleAggregator): class SpecimenAggregator(SimpleAggregator): - pass + + def _accumulator(self, field) -> Accumulator | None: + if field == 'biomaterial_id': + # These fields are only aggregated for files, where they are needed + # for compact and PFB manifests + if self.outer_entity_type == 'files': + return super()._accumulator(field) + else: + return None + else: + return super()._accumulator(field) class CellSuspensionAggregator(GroupingAggregator): diff --git a/src/azul/plugins/metadata/hca/service/response.py b/src/azul/plugins/metadata/hca/service/response.py index e0ffb3c01f..72debdbea4 100644 --- a/src/azul/plugins/metadata/hca/service/response.py +++ b/src/azul/plugins/metadata/hca/service/response.py @@ -429,7 +429,6 @@ def make_file(self, source: SourceSpec, file: JSON) -> JSON: def make_specimen(self, specimen) -> MutableJSON: return { - 'id': specimen['biomaterial_id'], 'organ': specimen.get('organ', None), 'organPart': specimen.get('organ_part', None), 'disease': specimen.get('disease', None), diff --git 
a/test/indexer/data/aaa96233-bf27-44c7-82df-b4dc15ad4d9d.2018-11-02T11:33:44.698028Z.results.json b/test/indexer/data/aaa96233-bf27-44c7-82df-b4dc15ad4d9d.2018-11-02T11:33:44.698028Z.results.json index 357bee3d79..ae8e1ab38a 100644 --- a/test/indexer/data/aaa96233-bf27-44c7-82df-b4dc15ad4d9d.2018-11-02T11:33:44.698028Z.results.json +++ b/test/indexer/data/aaa96233-bf27-44c7-82df-b4dc15ad4d9d.2018-11-02T11:33:44.698028Z.results.json @@ -1483,9 +1483,6 @@ "document_id": [ "a21dc760-a500-4236-bcff-da34a0e873d2" ], - "biomaterial_id": [ - "DID_scRSq06_pancreas" - ], "disease": [ "normal" ], @@ -2376,9 +2373,6 @@ "document_id": [ "a21dc760-a500-4236-bcff-da34a0e873d2" ], - "biomaterial_id": [ - "DID_scRSq06_pancreas" - ], "disease": [ "normal" ], @@ -2652,9 +2646,6 @@ "document_id": [ "a21dc760-a500-4236-bcff-da34a0e873d2" ], - "biomaterial_id": [ - "DID_scRSq06_pancreas" - ], "disease": [ "normal" ], @@ -3232,9 +3223,6 @@ "document_id": [ "a21dc760-a500-4236-bcff-da34a0e873d2" ], - "biomaterial_id": [ - "DID_scRSq06_pancreas" - ], "disease": [ "normal" ], diff --git a/test/service/test_app_logging.py b/test/service/test_app_logging.py index 9df45d9ccb..cacd2f6e1d 100644 --- a/test/service/test_app_logging.py +++ b/test/service/test_app_logging.py @@ -153,7 +153,7 @@ def filter_body(organ: str) -> JSON: elif debug == 1: expected_log = f'… with a response body starting in {body[:prefix_len]}' elif debug > 1: - expected_log = f'… with a response body of length 9114 being {body}' + expected_log = f'… with a response body of length 9050 being {body}' else: assert False self.assertEqual(expected_log, body_log_message) diff --git a/test/service/test_response.py b/test/service/test_response.py index 569a16577f..a4d359c260 100644 --- a/test/service/test_response.py +++ b/test/service/test_response.py @@ -328,7 +328,6 @@ def test_response_stage_files(self): 'sampleEntityType': ['specimens'], 'effectiveOrgan': ['pancreas'], 'disease': ['normal'], - 'id': ['DID_scRSq06_pancreas'], 
'organ': ['pancreas'], 'organPart': ['islet of Langerhans'], 'preservationMethod': [None], @@ -345,7 +344,6 @@ def test_response_stage_files(self): 'specimens': [ { 'disease': ['normal'], - 'id': ['DID_scRSq06_pancreas'], 'organ': ['pancreas'], 'organPart': ['islet of Langerhans'], 'preservationMethod': [None], @@ -654,7 +652,6 @@ def test_response_stage_projects(self): 'sampleEntityType': ['specimens'], 'effectiveOrgan': ['pancreas'], 'disease': ['normal'], - 'id': ['DID_scRSq06_pancreas'], 'organ': ['pancreas'], 'organPart': ['islet of Langerhans'], 'preservationMethod': [None], @@ -671,7 +668,6 @@ def test_response_stage_projects(self): 'specimens': [ { 'disease': ['normal'], - 'id': ['DID_scRSq06_pancreas'], 'organ': ['pancreas'], 'organPart': ['islet of Langerhans'], 'preservationMethod': [None], @@ -920,7 +916,6 @@ def test_response_stage_projects_accessions(self): 'sampleEntityType': ['specimens'], 'effectiveOrgan': ['brain'], 'disease': ['H syndrome'], - 'id': ['specimen_ID_1'], 'organ': ['brain'], 'organPart': ['amygdala'], 'preservationMethod': [None], @@ -937,7 +932,6 @@ def test_response_stage_projects_accessions(self): 'specimens': [ { 'disease': ['H syndrome'], - 'id': ['specimen_ID_1'], 'organ': ['brain'], 'organPart': ['amygdala'], 'preservationMethod': [None], @@ -2381,7 +2375,6 @@ def test_inner_entity_samples(self): { 'sampleEntityType': ['specimens'], 'effectiveOrgan': ['embryo'], - 'id': ['Specimen1'], 'organ': ['embryo'], 'organPart': ['skin epidermis'], 'disease': ['normal'], @@ -2414,7 +2407,6 @@ def test_inner_entity_samples(self): { 'sampleEntityType': ['specimens'], 'effectiveOrgan': ['embryo'], - 'id': ['Specimen1'], 'organ': ['embryo'], 'organPart': ['skin epidermis'], 'disease': ['normal'], @@ -2426,7 +2418,6 @@ def test_inner_entity_samples(self): { 'sampleEntityType': ['specimens'], 'effectiveOrgan': ['pancreas'], - 'id': ['DID_scRSq06_pancreas'], 'organ': ['pancreas'], 'organPart': ['islet of Langerhans'], 'disease': ['normal'], 
From 361b2e80164188ca538ca84fbabe02c4e5e95037 Mon Sep 17 00:00:00 2001 From: Daniel Sotirhos Date: Mon, 5 Jan 2026 16:55:18 -0800 Subject: [PATCH 15/59] [r] Conditionally remove aggregation of specimens.document_id in HCA (#6793) --- src/azul/plugins/metadata/hca/indexer/aggregate.py | 8 +++++++- ...b4dc15ad4d9d.2018-11-02T11:33:44.698028Z.results.json | 9 --------- 2 files changed, 7 insertions(+), 10 deletions(-) diff --git a/src/azul/plugins/metadata/hca/indexer/aggregate.py b/src/azul/plugins/metadata/hca/indexer/aggregate.py index b85956a106..d4c4199e5e 100644 --- a/src/azul/plugins/metadata/hca/indexer/aggregate.py +++ b/src/azul/plugins/metadata/hca/indexer/aggregate.py @@ -122,11 +122,17 @@ class SampleAggregator(SimpleAggregator): class SpecimenAggregator(SimpleAggregator): def _accumulator(self, field) -> Accumulator | None: - if field == 'biomaterial_id': + if field in ('biomaterial_id', 'document_id'): # These fields are only aggregated for files, where they are needed # for compact and PFB manifests if self.outer_entity_type == 'files': return super()._accumulator(field) + # `document_id` is included in the sample aggregate so that the + # summary response field `specimenCount` can be calculated. This + # should not be a problem since there should only ever be one + # specimen inner entity in a samples outer entity. 
+ elif field == 'document_id' and self.outer_entity_type == 'samples': + return super()._accumulator(field) else: return None else: diff --git a/test/indexer/data/aaa96233-bf27-44c7-82df-b4dc15ad4d9d.2018-11-02T11:33:44.698028Z.results.json b/test/indexer/data/aaa96233-bf27-44c7-82df-b4dc15ad4d9d.2018-11-02T11:33:44.698028Z.results.json index ae8e1ab38a..7828e587a6 100644 --- a/test/indexer/data/aaa96233-bf27-44c7-82df-b4dc15ad4d9d.2018-11-02T11:33:44.698028Z.results.json +++ b/test/indexer/data/aaa96233-bf27-44c7-82df-b4dc15ad4d9d.2018-11-02T11:33:44.698028Z.results.json @@ -1480,9 +1480,6 @@ "_source": [ "specimen_from_organism" ], - "document_id": [ - "a21dc760-a500-4236-bcff-da34a0e873d2" - ], "disease": [ "normal" ], @@ -2643,9 +2640,6 @@ "_source": [ "specimen_from_organism" ], - "document_id": [ - "a21dc760-a500-4236-bcff-da34a0e873d2" - ], "disease": [ "normal" ], @@ -3220,9 +3214,6 @@ "_source": [ "specimen_from_organism" ], - "document_id": [ - "a21dc760-a500-4236-bcff-da34a0e873d2" - ], "disease": [ "normal" ], From 6e98dd5d14d621b3c417d4c00475bb4454c31bca Mon Sep 17 00:00:00 2001 From: Daniel Sotirhos Date: Tue, 6 Jan 2026 11:14:53 -0800 Subject: [PATCH 16/59] [r] Conditionally remove aggregation of samples.biomaterial_id in HCA (#6793) --- .../plugins/metadata/hca/indexer/aggregate.py | 12 +++++++++++- ...9d.2018-11-02T11:33:44.698028Z.results.json | 18 ------------------ 2 files changed, 11 insertions(+), 19 deletions(-) diff --git a/src/azul/plugins/metadata/hca/indexer/aggregate.py b/src/azul/plugins/metadata/hca/indexer/aggregate.py index d4c4199e5e..df3b93c71e 100644 --- a/src/azul/plugins/metadata/hca/indexer/aggregate.py +++ b/src/azul/plugins/metadata/hca/indexer/aggregate.py @@ -116,7 +116,17 @@ def _default_accumulator(self) -> Accumulator | None: class SampleAggregator(SimpleAggregator): - pass + + def _accumulator(self, field) -> Accumulator | None: + if field == 'biomaterial_id': + # These fields are only aggregated for files, where they 
are needed + # for compact and PFB manifests + if self.outer_entity_type == 'files': + return super()._accumulator(field) + else: + return None + else: + return super()._accumulator(field) class SpecimenAggregator(SimpleAggregator): diff --git a/test/indexer/data/aaa96233-bf27-44c7-82df-b4dc15ad4d9d.2018-11-02T11:33:44.698028Z.results.json b/test/indexer/data/aaa96233-bf27-44c7-82df-b4dc15ad4d9d.2018-11-02T11:33:44.698028Z.results.json index 7828e587a6..1f9d1b9b0e 100644 --- a/test/indexer/data/aaa96233-bf27-44c7-82df-b4dc15ad4d9d.2018-11-02T11:33:44.698028Z.results.json +++ b/test/indexer/data/aaa96233-bf27-44c7-82df-b4dc15ad4d9d.2018-11-02T11:33:44.698028Z.results.json @@ -1405,9 +1405,6 @@ "document_id": [ "a21dc760-a500-4236-bcff-da34a0e873d2" ], - "biomaterial_id": [ - "DID_scRSq06_pancreas" - ], "entity_type": [ "specimens" ], @@ -1439,9 +1436,6 @@ "document_id": [ "a21dc760-a500-4236-bcff-da34a0e873d2" ], - "biomaterial_id": [ - "DID_scRSq06_pancreas" - ], "disease": [ "normal" ], @@ -2565,9 +2559,6 @@ "document_id": [ "a21dc760-a500-4236-bcff-da34a0e873d2" ], - "biomaterial_id": [ - "DID_scRSq06_pancreas" - ], "entity_type": [ "specimens" ], @@ -2599,9 +2590,6 @@ "document_id": [ "a21dc760-a500-4236-bcff-da34a0e873d2" ], - "biomaterial_id": [ - "DID_scRSq06_pancreas" - ], "disease": [ "normal" ], @@ -3139,9 +3127,6 @@ "document_id": [ "a21dc760-a500-4236-bcff-da34a0e873d2" ], - "biomaterial_id": [ - "DID_scRSq06_pancreas" - ], "entity_type": [ "specimens" ], @@ -3173,9 +3158,6 @@ "document_id": [ "a21dc760-a500-4236-bcff-da34a0e873d2" ], - "biomaterial_id": [ - "DID_scRSq06_pancreas" - ], "disease": [ "normal" ], From c627db0f981f770fa465d473810a46c7df45374d Mon Sep 17 00:00:00 2001 From: Daniel Sotirhos Date: Mon, 5 Jan 2026 16:23:10 -0800 Subject: [PATCH 17/59] [r] Conditionally remove aggregation of samples.document_id in HCA (#6793) --- .../plugins/metadata/hca/indexer/aggregate.py | 2 +- .../plugins/metadata/hca/service/response.py | 5 +++-- 
...9d.2018-11-02T11:33:44.698028Z.results.json | 18 ------------------ 3 files changed, 4 insertions(+), 21 deletions(-) diff --git a/src/azul/plugins/metadata/hca/indexer/aggregate.py b/src/azul/plugins/metadata/hca/indexer/aggregate.py index df3b93c71e..90fe55e96b 100644 --- a/src/azul/plugins/metadata/hca/indexer/aggregate.py +++ b/src/azul/plugins/metadata/hca/indexer/aggregate.py @@ -118,7 +118,7 @@ def _default_accumulator(self) -> Accumulator | None: class SampleAggregator(SimpleAggregator): def _accumulator(self, field) -> Accumulator | None: - if field == 'biomaterial_id': + if field in ('biomaterial_id', 'document_id'): # These fields are only aggregated for files, where they are needed # for compact and PFB manifests if self.outer_entity_type == 'files': diff --git a/src/azul/plugins/metadata/hca/service/response.py b/src/azul/plugins/metadata/hca/service/response.py index 72debdbea4..8ae7d64215 100644 --- a/src/azul/plugins/metadata/hca/service/response.py +++ b/src/azul/plugins/metadata/hca/service/response.py @@ -489,11 +489,12 @@ def make_organoids(self, entry) -> MutableJSONs: return [self.make_organoid(organoid) for organoid in entry['contents']['organoids']] def make_sample(self, sample, entity_dict, entity_type) -> MutableJSON: - is_aggregate = isinstance(sample['document_id'], list) organ_prop = 'organ' if entity_type == 'specimens' else 'model_organ' + effective_organ = sample[organ_prop] + is_aggregate = isinstance(effective_organ, list) return { 'sampleEntityType': [entity_type] if is_aggregate else entity_type, - 'effectiveOrgan': sample[organ_prop], + 'effectiveOrgan': effective_organ, **entity_dict } diff --git a/test/indexer/data/aaa96233-bf27-44c7-82df-b4dc15ad4d9d.2018-11-02T11:33:44.698028Z.results.json b/test/indexer/data/aaa96233-bf27-44c7-82df-b4dc15ad4d9d.2018-11-02T11:33:44.698028Z.results.json index 1f9d1b9b0e..00970feaa9 100644 --- a/test/indexer/data/aaa96233-bf27-44c7-82df-b4dc15ad4d9d.2018-11-02T11:33:44.698028Z.results.json 
+++ b/test/indexer/data/aaa96233-bf27-44c7-82df-b4dc15ad4d9d.2018-11-02T11:33:44.698028Z.results.json @@ -1402,9 +1402,6 @@ "contents": { "samples": [ { - "document_id": [ - "a21dc760-a500-4236-bcff-da34a0e873d2" - ], "entity_type": [ "specimens" ], @@ -1433,9 +1430,6 @@ "_source": [ "specimen_from_organism" ], - "document_id": [ - "a21dc760-a500-4236-bcff-da34a0e873d2" - ], "disease": [ "normal" ], @@ -2556,9 +2550,6 @@ "contents": { "samples": [ { - "document_id": [ - "a21dc760-a500-4236-bcff-da34a0e873d2" - ], "entity_type": [ "specimens" ], @@ -2587,9 +2578,6 @@ "_source": [ "specimen_from_organism" ], - "document_id": [ - "a21dc760-a500-4236-bcff-da34a0e873d2" - ], "disease": [ "normal" ], @@ -3124,9 +3112,6 @@ "contents": { "samples": [ { - "document_id": [ - "a21dc760-a500-4236-bcff-da34a0e873d2" - ], "entity_type": [ "specimens" ], @@ -3155,9 +3140,6 @@ "_source": [ "specimen_from_organism" ], - "document_id": [ - "a21dc760-a500-4236-bcff-da34a0e873d2" - ], "disease": [ "normal" ], From a3565af9046139ed2d8f083f556fc6d8907d7076 Mon Sep 17 00:00:00 2001 From: Daniel Sotirhos Date: Tue, 6 Jan 2026 16:53:22 -0800 Subject: [PATCH 18/59] [r] Conditionally remove aggregation of activities.activity_id in AnVIL (#6793) --- .../plugins/metadata/anvil/indexer/aggregate.py | 9 ++++++++- ...ea02-e274-affe-aabc-eb3db63ad068.results.json | 16 ---------------- 2 files changed, 8 insertions(+), 17 deletions(-) diff --git a/src/azul/plugins/metadata/anvil/indexer/aggregate.py b/src/azul/plugins/metadata/anvil/indexer/aggregate.py index e7deaad221..b23690d27c 100644 --- a/src/azul/plugins/metadata/anvil/indexer/aggregate.py +++ b/src/azul/plugins/metadata/anvil/indexer/aggregate.py @@ -23,7 +23,14 @@ class ActivityAggregator(SimpleAggregator): - pass + + def _accumulator(self, field: str) -> Accumulator | None: + if field == 'activity_id' and self.outer_entity_type != 'files': + # These fields are only aggregated for files, where they are needed + # for compact and PFB 
manifests + return None + else: + return super()._accumulator(field) class BiosampleAggregator(SimpleAggregator): diff --git a/test/indexer/data/826dea02-e274-affe-aabc-eb3db63ad068.results.json b/test/indexer/data/826dea02-e274-affe-aabc-eb3db63ad068.results.json index 30b2d92fed..81fe5c2158 100644 --- a/test/indexer/data/826dea02-e274-affe-aabc-eb3db63ad068.results.json +++ b/test/indexer/data/826dea02-e274-affe-aabc-eb3db63ad068.results.json @@ -931,10 +931,6 @@ "sequencing:a6c663c7-6f26-4ed2-af9d-48e9c709a22b", "sequencing:d4f6c0c4-1e11-438e-8218-cfea63b8b051" ], - "activity_id": [ - "18b3be87-e26b-4376-0d8d-c1e370e90e07", - "a60c5138-3749-f7cb-8714-52d389ad5231" - ], "activity_table": [ "anvil_sequencingactivity" ], @@ -2332,10 +2328,6 @@ "sequencing:a6c663c7-6f26-4ed2-af9d-48e9c709a22b", "sequencing:d4f6c0c4-1e11-438e-8218-cfea63b8b051" ], - "activity_id": [ - "18b3be87-e26b-4376-0d8d-c1e370e90e07", - "a60c5138-3749-f7cb-8714-52d389ad5231" - ], "activity_table": [ "anvil_sequencingactivity" ], @@ -2856,10 +2848,6 @@ "sequencing:a6c663c7-6f26-4ed2-af9d-48e9c709a22b", "sequencing:d4f6c0c4-1e11-438e-8218-cfea63b8b051" ], - "activity_id": [ - "18b3be87-e26b-4376-0d8d-c1e370e90e07", - "a60c5138-3749-f7cb-8714-52d389ad5231" - ], "activity_table": [ "anvil_sequencingactivity" ], @@ -3391,10 +3379,6 @@ "sequencing:a6c663c7-6f26-4ed2-af9d-48e9c709a22b", "sequencing:d4f6c0c4-1e11-438e-8218-cfea63b8b051" ], - "activity_id": [ - "18b3be87-e26b-4376-0d8d-c1e370e90e07", - "a60c5138-3749-f7cb-8714-52d389ad5231" - ], "activity_table": [ "anvil_sequencingactivity" ], From 89d97c7b993c1ab35d5503a62b37012d17fa7c3a Mon Sep 17 00:00:00 2001 From: Daniel Sotirhos Date: Tue, 6 Jan 2026 17:08:58 -0800 Subject: [PATCH 19/59] [r] Conditionally remove aggregation of activities.document_id in AnVIL (#6793) --- .../plugins/metadata/anvil/indexer/aggregate.py | 5 ++++- ...ea02-e274-affe-aabc-eb3db63ad068.results.json | 16 ---------------- 2 files changed, 4 insertions(+), 17 deletions(-) 
diff --git a/src/azul/plugins/metadata/anvil/indexer/aggregate.py b/src/azul/plugins/metadata/anvil/indexer/aggregate.py index b23690d27c..4025b4fe53 100644 --- a/src/azul/plugins/metadata/anvil/indexer/aggregate.py +++ b/src/azul/plugins/metadata/anvil/indexer/aggregate.py @@ -25,7 +25,10 @@ class ActivityAggregator(SimpleAggregator): def _accumulator(self, field: str) -> Accumulator | None: - if field == 'activity_id' and self.outer_entity_type != 'files': + if field in { + 'activity_id', + 'document_id' + } and self.outer_entity_type != 'files': # These fields are only aggregated for files, where they are needed # for compact and PFB manifests return None diff --git a/test/indexer/data/826dea02-e274-affe-aabc-eb3db63ad068.results.json b/test/indexer/data/826dea02-e274-affe-aabc-eb3db63ad068.results.json index 81fe5c2158..69a68372fa 100644 --- a/test/indexer/data/826dea02-e274-affe-aabc-eb3db63ad068.results.json +++ b/test/indexer/data/826dea02-e274-affe-aabc-eb3db63ad068.results.json @@ -923,10 +923,6 @@ "contents": { "activities": [ { - "document_id": [ - "1509ef40-d1ba-440d-b298-16b7c173dcd4", - "816e364e-1193-4e5b-a91a-14e4b009157c" - ], "source_datarepo_row_ids": [ "sequencing:a6c663c7-6f26-4ed2-af9d-48e9c709a22b", "sequencing:d4f6c0c4-1e11-438e-8218-cfea63b8b051" @@ -2320,10 +2316,6 @@ "contents": { "activities": [ { - "document_id": [ - "1509ef40-d1ba-440d-b298-16b7c173dcd4", - "816e364e-1193-4e5b-a91a-14e4b009157c" - ], "source_datarepo_row_ids": [ "sequencing:a6c663c7-6f26-4ed2-af9d-48e9c709a22b", "sequencing:d4f6c0c4-1e11-438e-8218-cfea63b8b051" @@ -2840,10 +2832,6 @@ "contents": { "activities": [ { - "document_id": [ - "1509ef40-d1ba-440d-b298-16b7c173dcd4", - "816e364e-1193-4e5b-a91a-14e4b009157c" - ], "source_datarepo_row_ids": [ "sequencing:a6c663c7-6f26-4ed2-af9d-48e9c709a22b", "sequencing:d4f6c0c4-1e11-438e-8218-cfea63b8b051" @@ -3371,10 +3359,6 @@ "contents": { "activities": [ { - "document_id": [ - "1509ef40-d1ba-440d-b298-16b7c173dcd4", - 
"816e364e-1193-4e5b-a91a-14e4b009157c" - ], "source_datarepo_row_ids": [ "sequencing:a6c663c7-6f26-4ed2-af9d-48e9c709a22b", "sequencing:d4f6c0c4-1e11-438e-8218-cfea63b8b051" From c68ff9dbf235fefcda3c0ad0594a38718586b8f4 Mon Sep 17 00:00:00 2001 From: Daniel Sotirhos Date: Tue, 6 Jan 2026 17:13:38 -0800 Subject: [PATCH 20/59] [r] Conditionally remove aggregation of activities.source_datarepo_row_ids in AnVIL (#6793) --- .../plugins/metadata/anvil/indexer/aggregate.py | 3 ++- ...ea02-e274-affe-aabc-eb3db63ad068.results.json | 16 ---------------- 2 files changed, 2 insertions(+), 17 deletions(-) diff --git a/src/azul/plugins/metadata/anvil/indexer/aggregate.py b/src/azul/plugins/metadata/anvil/indexer/aggregate.py index 4025b4fe53..b446417da5 100644 --- a/src/azul/plugins/metadata/anvil/indexer/aggregate.py +++ b/src/azul/plugins/metadata/anvil/indexer/aggregate.py @@ -27,7 +27,8 @@ class ActivityAggregator(SimpleAggregator): def _accumulator(self, field: str) -> Accumulator | None: if field in { 'activity_id', - 'document_id' + 'document_id', + 'source_datarepo_row_ids' } and self.outer_entity_type != 'files': # These fields are only aggregated for files, where they are needed # for compact and PFB manifests diff --git a/test/indexer/data/826dea02-e274-affe-aabc-eb3db63ad068.results.json b/test/indexer/data/826dea02-e274-affe-aabc-eb3db63ad068.results.json index 69a68372fa..30fb5ba794 100644 --- a/test/indexer/data/826dea02-e274-affe-aabc-eb3db63ad068.results.json +++ b/test/indexer/data/826dea02-e274-affe-aabc-eb3db63ad068.results.json @@ -923,10 +923,6 @@ "contents": { "activities": [ { - "source_datarepo_row_ids": [ - "sequencing:a6c663c7-6f26-4ed2-af9d-48e9c709a22b", - "sequencing:d4f6c0c4-1e11-438e-8218-cfea63b8b051" - ], "activity_table": [ "anvil_sequencingactivity" ], @@ -2316,10 +2312,6 @@ "contents": { "activities": [ { - "source_datarepo_row_ids": [ - "sequencing:a6c663c7-6f26-4ed2-af9d-48e9c709a22b", - "sequencing:d4f6c0c4-1e11-438e-8218-cfea63b8b051" - 
], "activity_table": [ "anvil_sequencingactivity" ], @@ -2832,10 +2824,6 @@ "contents": { "activities": [ { - "source_datarepo_row_ids": [ - "sequencing:a6c663c7-6f26-4ed2-af9d-48e9c709a22b", - "sequencing:d4f6c0c4-1e11-438e-8218-cfea63b8b051" - ], "activity_table": [ "anvil_sequencingactivity" ], @@ -3359,10 +3347,6 @@ "contents": { "activities": [ { - "source_datarepo_row_ids": [ - "sequencing:a6c663c7-6f26-4ed2-af9d-48e9c709a22b", - "sequencing:d4f6c0c4-1e11-438e-8218-cfea63b8b051" - ], "activity_table": [ "anvil_sequencingactivity" ], From 592670bdb88e9dca50fa688d7944bc883c81f7e7 Mon Sep 17 00:00:00 2001 From: Daniel Sotirhos Date: Tue, 6 Jan 2026 17:18:40 -0800 Subject: [PATCH 21/59] [r] Conditionally remove aggregation of biosamples.biosample_id in AnVIL (#6793) --- .../plugins/metadata/anvil/indexer/aggregate.py | 6 +++++- ...dea02-e274-affe-aabc-eb3db63ad068.results.json | 15 --------------- 2 files changed, 5 insertions(+), 16 deletions(-) diff --git a/src/azul/plugins/metadata/anvil/indexer/aggregate.py b/src/azul/plugins/metadata/anvil/indexer/aggregate.py index b446417da5..37c3cf0950 100644 --- a/src/azul/plugins/metadata/anvil/indexer/aggregate.py +++ b/src/azul/plugins/metadata/anvil/indexer/aggregate.py @@ -40,7 +40,11 @@ def _accumulator(self, field: str) -> Accumulator | None: class BiosampleAggregator(SimpleAggregator): def _accumulator(self, field: str) -> Accumulator | None: - if field == 'donor_age_at_collection': + if field == 'biosample_id' and self.outer_entity_type != 'files': + # These fields are only aggregated for files, where they are needed + # for compact and PFB manifests + return None + elif field == 'donor_age_at_collection': return SetOfDictAccumulator(max_size=100, key=compose_keys(none_safe_tuple_key(none_last=True), itemgetter('lte', 'gte'))) diff --git a/test/indexer/data/826dea02-e274-affe-aabc-eb3db63ad068.results.json b/test/indexer/data/826dea02-e274-affe-aabc-eb3db63ad068.results.json index 30fb5ba794..ca4b7f72f1 100644 
--- a/test/indexer/data/826dea02-e274-affe-aabc-eb3db63ad068.results.json +++ b/test/indexer/data/826dea02-e274-affe-aabc-eb3db63ad068.results.json @@ -34,9 +34,6 @@ "source_datarepo_row_ids": [ "sample:98048c3b-2525-4090-94fd-477de31f2608" ], - "biosample_id": [ - "f9d40cf6-37b8-22f3-ce35-0dc614d2452b" - ], "anatomical_site": [ "~null" ], @@ -948,9 +945,6 @@ "source_datarepo_row_ids": [ "sample:98048c3b-2525-4090-94fd-477de31f2608" ], - "biosample_id": [ - "f9d40cf6-37b8-22f3-ce35-0dc614d2452b" - ], "anatomical_site": [ "~null" ], @@ -1897,9 +1891,6 @@ "source_datarepo_row_ids": [ "sample:98048c3b-2525-4090-94fd-477de31f2608" ], - "biosample_id": [ - "f9d40cf6-37b8-22f3-ce35-0dc614d2452b" - ], "anatomical_site": [ "~null" ], @@ -2849,9 +2840,6 @@ "source_datarepo_row_ids": [ "sample:98048c3b-2525-4090-94fd-477de31f2608" ], - "biosample_id": [ - "f9d40cf6-37b8-22f3-ce35-0dc614d2452b" - ], "anatomical_site": [ "~null" ], @@ -3372,9 +3360,6 @@ "source_datarepo_row_ids": [ "sample:98048c3b-2525-4090-94fd-477de31f2608" ], - "biosample_id": [ - "f9d40cf6-37b8-22f3-ce35-0dc614d2452b" - ], "anatomical_site": [ "~null" ], From 904b6fee8fbd8deef9c1a23f421dab7bc989719e Mon Sep 17 00:00:00 2001 From: Daniel Sotirhos Date: Tue, 6 Jan 2026 17:24:23 -0800 Subject: [PATCH 22/59] [r] Conditionally remove aggregation of biosamples.document_id in AnVIL (#6793) --- .../plugins/metadata/anvil/indexer/aggregate.py | 5 ++++- ...dea02-e274-affe-aabc-eb3db63ad068.results.json | 15 --------------- 2 files changed, 4 insertions(+), 16 deletions(-) diff --git a/src/azul/plugins/metadata/anvil/indexer/aggregate.py b/src/azul/plugins/metadata/anvil/indexer/aggregate.py index 37c3cf0950..094c2fd28c 100644 --- a/src/azul/plugins/metadata/anvil/indexer/aggregate.py +++ b/src/azul/plugins/metadata/anvil/indexer/aggregate.py @@ -40,7 +40,10 @@ def _accumulator(self, field: str) -> Accumulator | None: class BiosampleAggregator(SimpleAggregator): def _accumulator(self, field: str) -> Accumulator | 
None: - if field == 'biosample_id' and self.outer_entity_type != 'files': + if field in { + 'biosample_id', + 'document_id' + } and self.outer_entity_type != 'files': # These fields are only aggregated for files, where they are needed # for compact and PFB manifests return None diff --git a/test/indexer/data/826dea02-e274-affe-aabc-eb3db63ad068.results.json b/test/indexer/data/826dea02-e274-affe-aabc-eb3db63ad068.results.json index ca4b7f72f1..2dccd702c4 100644 --- a/test/indexer/data/826dea02-e274-affe-aabc-eb3db63ad068.results.json +++ b/test/indexer/data/826dea02-e274-affe-aabc-eb3db63ad068.results.json @@ -28,9 +28,6 @@ ], "biosamples": [ { - "document_id": [ - "826dea02-e274-4ffe-aabc-eb3db63ad068" - ], "source_datarepo_row_ids": [ "sample:98048c3b-2525-4090-94fd-477de31f2608" ], @@ -939,9 +936,6 @@ ], "biosamples": [ { - "document_id": [ - "826dea02-e274-4ffe-aabc-eb3db63ad068" - ], "source_datarepo_row_ids": [ "sample:98048c3b-2525-4090-94fd-477de31f2608" ], @@ -1885,9 +1879,6 @@ ], "biosamples": [ { - "document_id": [ - "826dea02-e274-4ffe-aabc-eb3db63ad068" - ], "source_datarepo_row_ids": [ "sample:98048c3b-2525-4090-94fd-477de31f2608" ], @@ -2834,9 +2825,6 @@ ], "biosamples": [ { - "document_id": [ - "826dea02-e274-4ffe-aabc-eb3db63ad068" - ], "source_datarepo_row_ids": [ "sample:98048c3b-2525-4090-94fd-477de31f2608" ], @@ -3354,9 +3342,6 @@ ], "biosamples": [ { - "document_id": [ - "826dea02-e274-4ffe-aabc-eb3db63ad068" - ], "source_datarepo_row_ids": [ "sample:98048c3b-2525-4090-94fd-477de31f2608" ], From 6ba7ddf25cd2ad587df0f0342d1224ce8510b2d4 Mon Sep 17 00:00:00 2001 From: Daniel Sotirhos Date: Tue, 6 Jan 2026 17:32:08 -0800 Subject: [PATCH 23/59] [r] Conditionally remove aggregation of biosamples.source_datarepo_row_ids in AnVIL (#6793) --- .../plugins/metadata/anvil/indexer/aggregate.py | 3 ++- ...dea02-e274-affe-aabc-eb3db63ad068.results.json | 15 --------------- 2 files changed, 2 insertions(+), 16 deletions(-) diff --git 
a/src/azul/plugins/metadata/anvil/indexer/aggregate.py b/src/azul/plugins/metadata/anvil/indexer/aggregate.py index 094c2fd28c..1ce98a3be8 100644 --- a/src/azul/plugins/metadata/anvil/indexer/aggregate.py +++ b/src/azul/plugins/metadata/anvil/indexer/aggregate.py @@ -42,7 +42,8 @@ class BiosampleAggregator(SimpleAggregator): def _accumulator(self, field: str) -> Accumulator | None: if field in { 'biosample_id', - 'document_id' + 'document_id', + 'source_datarepo_row_ids' } and self.outer_entity_type != 'files': # These fields are only aggregated for files, where they are needed # for compact and PFB manifests diff --git a/test/indexer/data/826dea02-e274-affe-aabc-eb3db63ad068.results.json b/test/indexer/data/826dea02-e274-affe-aabc-eb3db63ad068.results.json index 2dccd702c4..6101d63525 100644 --- a/test/indexer/data/826dea02-e274-affe-aabc-eb3db63ad068.results.json +++ b/test/indexer/data/826dea02-e274-affe-aabc-eb3db63ad068.results.json @@ -28,9 +28,6 @@ ], "biosamples": [ { - "source_datarepo_row_ids": [ - "sample:98048c3b-2525-4090-94fd-477de31f2608" - ], "anatomical_site": [ "~null" ], @@ -936,9 +933,6 @@ ], "biosamples": [ { - "source_datarepo_row_ids": [ - "sample:98048c3b-2525-4090-94fd-477de31f2608" - ], "anatomical_site": [ "~null" ], @@ -1879,9 +1873,6 @@ ], "biosamples": [ { - "source_datarepo_row_ids": [ - "sample:98048c3b-2525-4090-94fd-477de31f2608" - ], "anatomical_site": [ "~null" ], @@ -2825,9 +2816,6 @@ ], "biosamples": [ { - "source_datarepo_row_ids": [ - "sample:98048c3b-2525-4090-94fd-477de31f2608" - ], "anatomical_site": [ "~null" ], @@ -3342,9 +3330,6 @@ ], "biosamples": [ { - "source_datarepo_row_ids": [ - "sample:98048c3b-2525-4090-94fd-477de31f2608" - ], "anatomical_site": [ "~null" ], From 0de0507d86e604458a9e24420ec23d228b683fa4 Mon Sep 17 00:00:00 2001 From: Daniel Sotirhos Date: Tue, 6 Jan 2026 17:41:30 -0800 Subject: [PATCH 24/59] Document need for aggregation of datasets.document_id in AnVIL (#6793) --- 
.../plugins/metadata/anvil/indexer/aggregate.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/src/azul/plugins/metadata/anvil/indexer/aggregate.py b/src/azul/plugins/metadata/anvil/indexer/aggregate.py index 1ce98a3be8..809edcd1fd 100644 --- a/src/azul/plugins/metadata/anvil/indexer/aggregate.py +++ b/src/azul/plugins/metadata/anvil/indexer/aggregate.py @@ -57,7 +57,18 @@ def _accumulator(self, field: str) -> Accumulator | None: class DatasetAggregator(SimpleAggregator): - pass + + def _accumulator(self, field: str) -> Accumulator | None: + if field == 'document_id': + # If any dataset IDs are missing from the aggregate, those datasets + # will be omitted during the verbatim handover. Datasets are a "hot" + # entity type, and we can't track their hubs in replica documents, + # so we rely on the inner entity IDs instead. We also need to + # aggregate document_id to allow filtering by the value on + # non-dataset endpoints. + return super()._accumulator(field) + else: + return super()._accumulator(field) class DiagnosisAggregator(SimpleAggregator): From ac5d8cdfd2898bd201143d54a4268280152b2997 Mon Sep 17 00:00:00 2001 From: Daniel Sotirhos Date: Wed, 21 Jan 2026 10:36:03 -0800 Subject: [PATCH 25/59] [r] Remove aggregation of datasets.source_datarepo_row_ids in AnVIL (#6793) --- .../plugins/metadata/anvil/indexer/aggregate.py | 4 ++++ ...dea02-e274-affe-aabc-eb3db63ad068.results.json | 15 --------------- 2 files changed, 4 insertions(+), 15 deletions(-) diff --git a/src/azul/plugins/metadata/anvil/indexer/aggregate.py b/src/azul/plugins/metadata/anvil/indexer/aggregate.py index 809edcd1fd..7e3245854f 100644 --- a/src/azul/plugins/metadata/anvil/indexer/aggregate.py +++ b/src/azul/plugins/metadata/anvil/indexer/aggregate.py @@ -67,6 +67,10 @@ def _accumulator(self, field: str) -> Accumulator | None: # aggregate document_id to allow filtering by the value on # non-dataset endpoints. 
return super()._accumulator(field) + elif field == 'source_datarepo_row_ids' and self.outer_entity_type != 'files': + # These fields are only aggregated for files, where they are needed + # for compact and PFB manifests + return None else: return super()._accumulator(field) diff --git a/test/indexer/data/826dea02-e274-affe-aabc-eb3db63ad068.results.json b/test/indexer/data/826dea02-e274-affe-aabc-eb3db63ad068.results.json index 6101d63525..4e02669f6e 100644 --- a/test/indexer/data/826dea02-e274-affe-aabc-eb3db63ad068.results.json +++ b/test/indexer/data/826dea02-e274-affe-aabc-eb3db63ad068.results.json @@ -57,9 +57,6 @@ "document_id": [ "2370f948-2783-4eb6-afea-e022897f4dcf" ], - "source_datarepo_row_ids": [ - "workspace_attributes:7a22b629-9d81-4e4d-9297-f9e44ed760bc" - ], "dataset_id": [ "52ee7665-7033-63f2-a8d9-ce8e32666739" ], @@ -1902,9 +1899,6 @@ "document_id": [ "2370f948-2783-4eb6-afea-e022897f4dcf" ], - "source_datarepo_row_ids": [ - "workspace_attributes:7a22b629-9d81-4e4d-9297-f9e44ed760bc" - ], "dataset_id": [ "52ee7665-7033-63f2-a8d9-ce8e32666739" ], @@ -2328,9 +2322,6 @@ "document_id": [ "2370f948-2783-4eb6-afea-e022897f4dcf" ], - "source_datarepo_row_ids": [ - "workspace_attributes:7a22b629-9d81-4e4d-9297-f9e44ed760bc" - ], "dataset_id": [ "52ee7665-7033-63f2-a8d9-ce8e32666739" ], @@ -2845,9 +2836,6 @@ "document_id": [ "2370f948-2783-4eb6-afea-e022897f4dcf" ], - "source_datarepo_row_ids": [ - "workspace_attributes:7a22b629-9d81-4e4d-9297-f9e44ed760bc" - ], "dataset_id": [ "52ee7665-7033-63f2-a8d9-ce8e32666739" ], @@ -3359,9 +3347,6 @@ "document_id": [ "2370f948-2783-4eb6-afea-e022897f4dcf" ], - "source_datarepo_row_ids": [ - "workspace_attributes:7a22b629-9d81-4e4d-9297-f9e44ed760bc" - ], "dataset_id": [ "52ee7665-7033-63f2-a8d9-ce8e32666739" ], From 8a0c12f440ffc52f5416c04baad7d747944b9380 Mon Sep 17 00:00:00 2001 From: Daniel Sotirhos Date: Tue, 6 Jan 2026 17:45:15 -0800 Subject: [PATCH 26/59] [r] Conditionally remove aggregation of 
diagnoses.diagnosis_id in AnVIL (#6793) --- .../metadata/anvil/indexer/aggregate.py | 6 ++++- ...2-e274-affe-aabc-eb3db63ad068.results.json | 24 ------------------- 2 files changed, 5 insertions(+), 25 deletions(-) diff --git a/src/azul/plugins/metadata/anvil/indexer/aggregate.py b/src/azul/plugins/metadata/anvil/indexer/aggregate.py index 7e3245854f..e364198353 100644 --- a/src/azul/plugins/metadata/anvil/indexer/aggregate.py +++ b/src/azul/plugins/metadata/anvil/indexer/aggregate.py @@ -78,7 +78,11 @@ def _accumulator(self, field: str) -> Accumulator | None: class DiagnosisAggregator(SimpleAggregator): def _accumulator(self, field: str) -> Accumulator | None: - if field in ('diagnosis_age', 'onset_age'): + if field == 'diagnosis_id' and self.outer_entity_type != 'files': + # These fields are only aggregated for files, where they are needed + # for compact and PFB manifests + return None + elif field in ('diagnosis_age', 'onset_age'): return SetOfDictAccumulator(max_size=100, key=compose_keys(none_safe_tuple_key(none_last=True), itemgetter('lte', 'gte'))) diff --git a/test/indexer/data/826dea02-e274-affe-aabc-eb3db63ad068.results.json b/test/indexer/data/826dea02-e274-affe-aabc-eb3db63ad068.results.json index 4e02669f6e..d1b40a2f11 100644 --- a/test/indexer/data/826dea02-e274-affe-aabc-eb3db63ad068.results.json +++ b/test/indexer/data/826dea02-e274-affe-aabc-eb3db63ad068.results.json @@ -92,10 +92,6 @@ "source_datarepo_row_ids": [ "subject:c23887a0-20c1-44e4-a09e-1c5dfdc2d0ef" ], - "diagnosis_id": [ - "25ff8d32-18c9-fc3e-020a-5de20d35d906", - "5ebe9bc4-a1be-0ddf-7277-b1e88276d0f6" - ], "disease": [ "redacted-A61iJlLx", "redacted-g50ublm/" @@ -991,10 +987,6 @@ "source_datarepo_row_ids": [ "subject:c23887a0-20c1-44e4-a09e-1c5dfdc2d0ef" ], - "diagnosis_id": [ - "25ff8d32-18c9-fc3e-020a-5de20d35d906", - "5ebe9bc4-a1be-0ddf-7277-b1e88276d0f6" - ], "disease": [ "redacted-A61iJlLx", "redacted-g50ublm/" @@ -1934,10 +1926,6 @@ "source_datarepo_row_ids": [ 
"subject:c23887a0-20c1-44e4-a09e-1c5dfdc2d0ef" ], - "diagnosis_id": [ - "25ff8d32-18c9-fc3e-020a-5de20d35d906", - "5ebe9bc4-a1be-0ddf-7277-b1e88276d0f6" - ], "disease": [ "redacted-A61iJlLx", "redacted-g50ublm/" @@ -2357,10 +2345,6 @@ "source_datarepo_row_ids": [ "subject:c23887a0-20c1-44e4-a09e-1c5dfdc2d0ef" ], - "diagnosis_id": [ - "25ff8d32-18c9-fc3e-020a-5de20d35d906", - "5ebe9bc4-a1be-0ddf-7277-b1e88276d0f6" - ], "disease": [ "redacted-A61iJlLx", "redacted-g50ublm/" @@ -2871,10 +2855,6 @@ "source_datarepo_row_ids": [ "subject:c23887a0-20c1-44e4-a09e-1c5dfdc2d0ef" ], - "diagnosis_id": [ - "25ff8d32-18c9-fc3e-020a-5de20d35d906", - "5ebe9bc4-a1be-0ddf-7277-b1e88276d0f6" - ], "disease": [ "redacted-A61iJlLx", "redacted-g50ublm/" @@ -3382,10 +3362,6 @@ "source_datarepo_row_ids": [ "subject:c23887a0-20c1-44e4-a09e-1c5dfdc2d0ef" ], - "diagnosis_id": [ - "25ff8d32-18c9-fc3e-020a-5de20d35d906", - "5ebe9bc4-a1be-0ddf-7277-b1e88276d0f6" - ], "disease": [ "redacted-A61iJlLx", "redacted-g50ublm/" From 083710d0691a267799b268fe59a2c76dba5cffaa Mon Sep 17 00:00:00 2001 From: Daniel Sotirhos Date: Tue, 6 Jan 2026 17:51:55 -0800 Subject: [PATCH 27/59] [r] Conditionally remove aggregation of diagnoses.document_id in AnVIL (#6793) --- .../metadata/anvil/indexer/aggregate.py | 5 +++- ...2-e274-affe-aabc-eb3db63ad068.results.json | 24 ------------------- 2 files changed, 4 insertions(+), 25 deletions(-) diff --git a/src/azul/plugins/metadata/anvil/indexer/aggregate.py b/src/azul/plugins/metadata/anvil/indexer/aggregate.py index e364198353..0a5ddc0e54 100644 --- a/src/azul/plugins/metadata/anvil/indexer/aggregate.py +++ b/src/azul/plugins/metadata/anvil/indexer/aggregate.py @@ -78,7 +78,10 @@ def _accumulator(self, field: str) -> Accumulator | None: class DiagnosisAggregator(SimpleAggregator): def _accumulator(self, field: str) -> Accumulator | None: - if field == 'diagnosis_id' and self.outer_entity_type != 'files': + if field in { + 'diagnosis_id', + 'document_id' + } and 
self.outer_entity_type != 'files': # These fields are only aggregated for files, where they are needed # for compact and PFB manifests return None diff --git a/test/indexer/data/826dea02-e274-affe-aabc-eb3db63ad068.results.json b/test/indexer/data/826dea02-e274-affe-aabc-eb3db63ad068.results.json index d1b40a2f11..ada9a42a4d 100644 --- a/test/indexer/data/826dea02-e274-affe-aabc-eb3db63ad068.results.json +++ b/test/indexer/data/826dea02-e274-affe-aabc-eb3db63ad068.results.json @@ -85,10 +85,6 @@ ], "diagnoses": [ { - "document_id": [ - "15d85d30-ad4a-4f50-87a8-a27f59dd1b5f", - "939a4bd3-86ed-4a8a-81f4-fbe0ee673461" - ], "source_datarepo_row_ids": [ "subject:c23887a0-20c1-44e4-a09e-1c5dfdc2d0ef" ], @@ -980,10 +976,6 @@ ], "diagnoses": [ { - "document_id": [ - "15d85d30-ad4a-4f50-87a8-a27f59dd1b5f", - "939a4bd3-86ed-4a8a-81f4-fbe0ee673461" - ], "source_datarepo_row_ids": [ "subject:c23887a0-20c1-44e4-a09e-1c5dfdc2d0ef" ], @@ -1919,10 +1911,6 @@ ], "diagnoses": [ { - "document_id": [ - "15d85d30-ad4a-4f50-87a8-a27f59dd1b5f", - "939a4bd3-86ed-4a8a-81f4-fbe0ee673461" - ], "source_datarepo_row_ids": [ "subject:c23887a0-20c1-44e4-a09e-1c5dfdc2d0ef" ], @@ -2338,10 +2326,6 @@ ], "diagnoses": [ { - "document_id": [ - "15d85d30-ad4a-4f50-87a8-a27f59dd1b5f", - "939a4bd3-86ed-4a8a-81f4-fbe0ee673461" - ], "source_datarepo_row_ids": [ "subject:c23887a0-20c1-44e4-a09e-1c5dfdc2d0ef" ], @@ -2848,10 +2832,6 @@ ], "diagnoses": [ { - "document_id": [ - "15d85d30-ad4a-4f50-87a8-a27f59dd1b5f", - "939a4bd3-86ed-4a8a-81f4-fbe0ee673461" - ], "source_datarepo_row_ids": [ "subject:c23887a0-20c1-44e4-a09e-1c5dfdc2d0ef" ], @@ -3355,10 +3335,6 @@ ], "diagnoses": [ { - "document_id": [ - "15d85d30-ad4a-4f50-87a8-a27f59dd1b5f", - "939a4bd3-86ed-4a8a-81f4-fbe0ee673461" - ], "source_datarepo_row_ids": [ "subject:c23887a0-20c1-44e4-a09e-1c5dfdc2d0ef" ], From 0dae10f6256a9328a03cbc2d053f3eacaba26f37 Mon Sep 17 00:00:00 2001 From: Daniel Sotirhos Date: Tue, 6 Jan 2026 17:57:03 -0800 Subject: [PATCH 
28/59] [r] Conditionally remove aggregation of diagnoses.source_datarepo_row_ids in AnVIL (#6793) --- .../metadata/anvil/indexer/aggregate.py | 3 ++- ...02-e274-affe-aabc-eb3db63ad068.results.json | 18 ------------------ 2 files changed, 2 insertions(+), 19 deletions(-) diff --git a/src/azul/plugins/metadata/anvil/indexer/aggregate.py b/src/azul/plugins/metadata/anvil/indexer/aggregate.py index 0a5ddc0e54..14ad0bd713 100644 --- a/src/azul/plugins/metadata/anvil/indexer/aggregate.py +++ b/src/azul/plugins/metadata/anvil/indexer/aggregate.py @@ -80,7 +80,8 @@ class DiagnosisAggregator(SimpleAggregator): def _accumulator(self, field: str) -> Accumulator | None: if field in { 'diagnosis_id', - 'document_id' + 'document_id', + 'source_datarepo_row_ids' } and self.outer_entity_type != 'files': # These fields are only aggregated for files, where they are needed # for compact and PFB manifests diff --git a/test/indexer/data/826dea02-e274-affe-aabc-eb3db63ad068.results.json b/test/indexer/data/826dea02-e274-affe-aabc-eb3db63ad068.results.json index ada9a42a4d..9fbc1ffb45 100644 --- a/test/indexer/data/826dea02-e274-affe-aabc-eb3db63ad068.results.json +++ b/test/indexer/data/826dea02-e274-affe-aabc-eb3db63ad068.results.json @@ -85,9 +85,6 @@ ], "diagnoses": [ { - "source_datarepo_row_ids": [ - "subject:c23887a0-20c1-44e4-a09e-1c5dfdc2d0ef" - ], "disease": [ "redacted-A61iJlLx", "redacted-g50ublm/" @@ -976,9 +973,6 @@ ], "diagnoses": [ { - "source_datarepo_row_ids": [ - "subject:c23887a0-20c1-44e4-a09e-1c5dfdc2d0ef" - ], "disease": [ "redacted-A61iJlLx", "redacted-g50ublm/" @@ -1911,9 +1905,6 @@ ], "diagnoses": [ { - "source_datarepo_row_ids": [ - "subject:c23887a0-20c1-44e4-a09e-1c5dfdc2d0ef" - ], "disease": [ "redacted-A61iJlLx", "redacted-g50ublm/" @@ -2326,9 +2317,6 @@ ], "diagnoses": [ { - "source_datarepo_row_ids": [ - "subject:c23887a0-20c1-44e4-a09e-1c5dfdc2d0ef" - ], "disease": [ "redacted-A61iJlLx", "redacted-g50ublm/" @@ -2832,9 +2820,6 @@ ], "diagnoses": [ { - 
"source_datarepo_row_ids": [ - "subject:c23887a0-20c1-44e4-a09e-1c5dfdc2d0ef" - ], "disease": [ "redacted-A61iJlLx", "redacted-g50ublm/" @@ -3335,9 +3320,6 @@ ], "diagnoses": [ { - "source_datarepo_row_ids": [ - "subject:c23887a0-20c1-44e4-a09e-1c5dfdc2d0ef" - ], "disease": [ "redacted-A61iJlLx", "redacted-g50ublm/" From 3b59f005aceb9b83a6e2f445c7b42ed468ee24c4 Mon Sep 17 00:00:00 2001 From: Daniel Sotirhos Date: Tue, 6 Jan 2026 18:03:30 -0800 Subject: [PATCH 29/59] [r] Conditionally remove aggregation of donors.document_id in AnVIL (#6793) --- .../plugins/metadata/anvil/indexer/aggregate.py | 9 ++++++++- ...dea02-e274-affe-aabc-eb3db63ad068.results.json | 15 --------------- 2 files changed, 8 insertions(+), 16 deletions(-) diff --git a/src/azul/plugins/metadata/anvil/indexer/aggregate.py b/src/azul/plugins/metadata/anvil/indexer/aggregate.py index 14ad0bd713..f3089ace36 100644 --- a/src/azul/plugins/metadata/anvil/indexer/aggregate.py +++ b/src/azul/plugins/metadata/anvil/indexer/aggregate.py @@ -95,7 +95,14 @@ def _accumulator(self, field: str) -> Accumulator | None: class DonorAggregator(SimpleAggregator): - pass + + def _accumulator(self, field: str) -> Accumulator | None: + if field == 'document_id' and self.outer_entity_type != 'files': + # These fields are only aggregated for files, where they are needed + # for compact and PFB manifests + return None + else: + return super()._accumulator(field) class FileAggregator(GroupingAggregator): diff --git a/test/indexer/data/826dea02-e274-affe-aabc-eb3db63ad068.results.json b/test/indexer/data/826dea02-e274-affe-aabc-eb3db63ad068.results.json index 9fbc1ffb45..1ac8191863 100644 --- a/test/indexer/data/826dea02-e274-affe-aabc-eb3db63ad068.results.json +++ b/test/indexer/data/826dea02-e274-affe-aabc-eb3db63ad068.results.json @@ -117,9 +117,6 @@ ], "donors": [ { - "document_id": [ - "bfd991f2-2797-4083-972a-da7c6d7f1b2e" - ], "source_datarepo_row_ids": [ "subject:c23887a0-20c1-44e4-a09e-1c5dfdc2d0ef" ], @@ -1005,9 
+1002,6 @@ ], "donors": [ { - "document_id": [ - "bfd991f2-2797-4083-972a-da7c6d7f1b2e" - ], "source_datarepo_row_ids": [ "subject:c23887a0-20c1-44e4-a09e-1c5dfdc2d0ef" ], @@ -1937,9 +1931,6 @@ ], "donors": [ { - "document_id": [ - "bfd991f2-2797-4083-972a-da7c6d7f1b2e" - ], "source_datarepo_row_ids": [ "subject:c23887a0-20c1-44e4-a09e-1c5dfdc2d0ef" ], @@ -2349,9 +2340,6 @@ ], "donors": [ { - "document_id": [ - "bfd991f2-2797-4083-972a-da7c6d7f1b2e" - ], "source_datarepo_row_ids": [ "subject:c23887a0-20c1-44e4-a09e-1c5dfdc2d0ef" ], @@ -2852,9 +2840,6 @@ ], "donors": [ { - "document_id": [ - "bfd991f2-2797-4083-972a-da7c6d7f1b2e" - ], "source_datarepo_row_ids": [ "subject:c23887a0-20c1-44e4-a09e-1c5dfdc2d0ef" ], From 6e5faa1332688924799c107c17ef3b7a9a10eb69 Mon Sep 17 00:00:00 2001 From: Daniel Sotirhos Date: Tue, 6 Jan 2026 18:09:33 -0800 Subject: [PATCH 30/59] [r] Conditionally remove aggregation of donors.donor_id in AnVIL (#6793) --- .../plugins/metadata/anvil/indexer/aggregate.py | 5 ++++- ...dea02-e274-affe-aabc-eb3db63ad068.results.json | 15 --------------- 2 files changed, 4 insertions(+), 16 deletions(-) diff --git a/src/azul/plugins/metadata/anvil/indexer/aggregate.py b/src/azul/plugins/metadata/anvil/indexer/aggregate.py index f3089ace36..36f540c617 100644 --- a/src/azul/plugins/metadata/anvil/indexer/aggregate.py +++ b/src/azul/plugins/metadata/anvil/indexer/aggregate.py @@ -97,7 +97,10 @@ def _accumulator(self, field: str) -> Accumulator | None: class DonorAggregator(SimpleAggregator): def _accumulator(self, field: str) -> Accumulator | None: - if field == 'document_id' and self.outer_entity_type != 'files': + if field in { + 'document_id', + 'donor_id' + } and self.outer_entity_type != 'files': # These fields are only aggregated for files, where they are needed # for compact and PFB manifests return None diff --git a/test/indexer/data/826dea02-e274-affe-aabc-eb3db63ad068.results.json b/test/indexer/data/826dea02-e274-affe-aabc-eb3db63ad068.results.json 
index 1ac8191863..6f9155b622 100644 --- a/test/indexer/data/826dea02-e274-affe-aabc-eb3db63ad068.results.json +++ b/test/indexer/data/826dea02-e274-affe-aabc-eb3db63ad068.results.json @@ -120,9 +120,6 @@ "source_datarepo_row_ids": [ "subject:c23887a0-20c1-44e4-a09e-1c5dfdc2d0ef" ], - "donor_id": [ - "1e2bd7e5-f45e-a391-daea-7c060be76acd" - ], "organism_type": [ "redacted-ACw+6ecI" ], @@ -1005,9 +1002,6 @@ "source_datarepo_row_ids": [ "subject:c23887a0-20c1-44e4-a09e-1c5dfdc2d0ef" ], - "donor_id": [ - "1e2bd7e5-f45e-a391-daea-7c060be76acd" - ], "organism_type": [ "redacted-ACw+6ecI" ], @@ -1934,9 +1928,6 @@ "source_datarepo_row_ids": [ "subject:c23887a0-20c1-44e4-a09e-1c5dfdc2d0ef" ], - "donor_id": [ - "1e2bd7e5-f45e-a391-daea-7c060be76acd" - ], "organism_type": [ "redacted-ACw+6ecI" ], @@ -2343,9 +2334,6 @@ "source_datarepo_row_ids": [ "subject:c23887a0-20c1-44e4-a09e-1c5dfdc2d0ef" ], - "donor_id": [ - "1e2bd7e5-f45e-a391-daea-7c060be76acd" - ], "organism_type": [ "redacted-ACw+6ecI" ], @@ -2843,9 +2831,6 @@ "source_datarepo_row_ids": [ "subject:c23887a0-20c1-44e4-a09e-1c5dfdc2d0ef" ], - "donor_id": [ - "1e2bd7e5-f45e-a391-daea-7c060be76acd" - ], "organism_type": [ "redacted-ACw+6ecI" ], From 955bdcabf1ef38c331af324bc9634da8545a2817 Mon Sep 17 00:00:00 2001 From: Daniel Sotirhos Date: Tue, 6 Jan 2026 18:15:20 -0800 Subject: [PATCH 31/59] [r] Conditionally remove aggregation of donors.source_datarepo_row_ids in AnVIL (#6793) --- .../plugins/metadata/anvil/indexer/aggregate.py | 3 ++- ...dea02-e274-affe-aabc-eb3db63ad068.results.json | 15 --------------- 2 files changed, 2 insertions(+), 16 deletions(-) diff --git a/src/azul/plugins/metadata/anvil/indexer/aggregate.py b/src/azul/plugins/metadata/anvil/indexer/aggregate.py index 36f540c617..217bc03296 100644 --- a/src/azul/plugins/metadata/anvil/indexer/aggregate.py +++ b/src/azul/plugins/metadata/anvil/indexer/aggregate.py @@ -99,7 +99,8 @@ class DonorAggregator(SimpleAggregator): def _accumulator(self, field: str) 
-> Accumulator | None: if field in { 'document_id', - 'donor_id' + 'donor_id', + 'source_datarepo_row_ids' } and self.outer_entity_type != 'files': # These fields are only aggregated for files, where they are needed # for compact and PFB manifests diff --git a/test/indexer/data/826dea02-e274-affe-aabc-eb3db63ad068.results.json b/test/indexer/data/826dea02-e274-affe-aabc-eb3db63ad068.results.json index 6f9155b622..f0a5085aac 100644 --- a/test/indexer/data/826dea02-e274-affe-aabc-eb3db63ad068.results.json +++ b/test/indexer/data/826dea02-e274-affe-aabc-eb3db63ad068.results.json @@ -117,9 +117,6 @@ ], "donors": [ { - "source_datarepo_row_ids": [ - "subject:c23887a0-20c1-44e4-a09e-1c5dfdc2d0ef" - ], "organism_type": [ "redacted-ACw+6ecI" ], @@ -999,9 +996,6 @@ ], "donors": [ { - "source_datarepo_row_ids": [ - "subject:c23887a0-20c1-44e4-a09e-1c5dfdc2d0ef" - ], "organism_type": [ "redacted-ACw+6ecI" ], @@ -1925,9 +1919,6 @@ ], "donors": [ { - "source_datarepo_row_ids": [ - "subject:c23887a0-20c1-44e4-a09e-1c5dfdc2d0ef" - ], "organism_type": [ "redacted-ACw+6ecI" ], @@ -2331,9 +2322,6 @@ ], "donors": [ { - "source_datarepo_row_ids": [ - "subject:c23887a0-20c1-44e4-a09e-1c5dfdc2d0ef" - ], "organism_type": [ "redacted-ACw+6ecI" ], @@ -2828,9 +2816,6 @@ ], "donors": [ { - "source_datarepo_row_ids": [ - "subject:c23887a0-20c1-44e4-a09e-1c5dfdc2d0ef" - ], "organism_type": [ "redacted-ACw+6ecI" ], From 66a262f6e1d46ff2cf27460bc886e5089802700b Mon Sep 17 00:00:00 2001 From: Daniel Sotirhos Date: Tue, 6 Jan 2026 18:19:33 -0800 Subject: [PATCH 32/59] [r] Remove aggregation of files.document_id in AnVIL (#6793) --- .../metadata/anvil/indexer/aggregate.py | 4 ++- ...2-e274-affe-aabc-eb3db63ad068.results.json | 30 ------------------- 2 files changed, 3 insertions(+), 31 deletions(-) diff --git a/src/azul/plugins/metadata/anvil/indexer/aggregate.py b/src/azul/plugins/metadata/anvil/indexer/aggregate.py index 217bc03296..12074aaece 100644 --- 
a/src/azul/plugins/metadata/anvil/indexer/aggregate.py +++ b/src/azul/plugins/metadata/anvil/indexer/aggregate.py @@ -125,7 +125,9 @@ def _group_keys(self, entity) -> tuple[Any, ...]: return entity['file_format'], def _accumulator(self, field: str) -> Accumulator | None: - if field in ('count', 'file_size'): + if field == 'document_id': + return None + elif field in ('count', 'file_size'): return DistinctAccumulator(SumAccumulator()) else: return super()._accumulator(field) diff --git a/test/indexer/data/826dea02-e274-affe-aabc-eb3db63ad068.results.json b/test/indexer/data/826dea02-e274-affe-aabc-eb3db63ad068.results.json index f0a5085aac..a57dd1b7be 100644 --- a/test/indexer/data/826dea02-e274-affe-aabc-eb3db63ad068.results.json +++ b/test/indexer/data/826dea02-e274-affe-aabc-eb3db63ad068.results.json @@ -133,9 +133,6 @@ ], "files": [ { - "document_id": [ - "15b76f9c-6b46-433f-851d-34e89f1b9ba6" - ], "source_datarepo_row_ids": [ "file_inventory:81d16471-97ac-48fe-99a0-73d9ec62c2c0" ], @@ -1012,9 +1009,6 @@ ], "files": [ { - "document_id": [ - "15b76f9c-6b46-433f-851d-34e89f1b9ba6" - ], "source_datarepo_row_ids": [ "file_inventory:81d16471-97ac-48fe-99a0-73d9ec62c2c0" ], @@ -1050,9 +1044,6 @@ "count": 1 }, { - "document_id": [ - "3b17377b-16b1-431c-9967-e5d01fc5923f" - ], "source_datarepo_row_ids": [ "file_inventory:9658d94a-511d-4b49-82c3-d0cb07e0cff2" ], @@ -1935,9 +1926,6 @@ ], "files": [ { - "document_id": [ - "3b17377b-16b1-431c-9967-e5d01fc5923f" - ], "source_datarepo_row_ids": [ "file_inventory:9658d94a-511d-4b49-82c3-d0cb07e0cff2" ], @@ -2338,9 +2326,6 @@ ], "files": [ { - "document_id": [ - "15b76f9c-6b46-433f-851d-34e89f1b9ba6" - ], "source_datarepo_row_ids": [ "file_inventory:81d16471-97ac-48fe-99a0-73d9ec62c2c0" ], @@ -2376,9 +2361,6 @@ "count": 1 }, { - "document_id": [ - "3b17377b-16b1-431c-9967-e5d01fc5923f" - ], "source_datarepo_row_ids": [ "file_inventory:9658d94a-511d-4b49-82c3-d0cb07e0cff2" ], @@ -2832,9 +2814,6 @@ ], "files": [ { - 
"document_id": [ - "15b76f9c-6b46-433f-851d-34e89f1b9ba6" - ], "source_datarepo_row_ids": [ "file_inventory:81d16471-97ac-48fe-99a0-73d9ec62c2c0" ], @@ -2870,9 +2849,6 @@ "count": 1 }, { - "document_id": [ - "3b17377b-16b1-431c-9967-e5d01fc5923f" - ], "source_datarepo_row_ids": [ "file_inventory:9658d94a-511d-4b49-82c3-d0cb07e0cff2" ], @@ -3324,9 +3300,6 @@ ], "files": [ { - "document_id": [ - "15b76f9c-6b46-433f-851d-34e89f1b9ba6" - ], "source_datarepo_row_ids": [ "file_inventory:81d16471-97ac-48fe-99a0-73d9ec62c2c0" ], @@ -3362,9 +3335,6 @@ "count": 1 }, { - "document_id": [ - "3b17377b-16b1-431c-9967-e5d01fc5923f" - ], "source_datarepo_row_ids": [ "file_inventory:9658d94a-511d-4b49-82c3-d0cb07e0cff2" ], From 06cf22c4d97949e74bc43d22754a926db8c1a209 Mon Sep 17 00:00:00 2001 From: Daniel Sotirhos Date: Tue, 6 Jan 2026 18:23:53 -0800 Subject: [PATCH 33/59] [r] Remove aggregation of files.drs_uri in AnVIL (#6793) --- .../metadata/anvil/indexer/aggregate.py | 5 ++- ...2-e274-affe-aabc-eb3db63ad068.results.json | 31 ------------------- 2 files changed, 4 insertions(+), 32 deletions(-) diff --git a/src/azul/plugins/metadata/anvil/indexer/aggregate.py b/src/azul/plugins/metadata/anvil/indexer/aggregate.py index 12074aaece..59a9935b3a 100644 --- a/src/azul/plugins/metadata/anvil/indexer/aggregate.py +++ b/src/azul/plugins/metadata/anvil/indexer/aggregate.py @@ -125,7 +125,10 @@ def _group_keys(self, entity) -> tuple[Any, ...]: return entity['file_format'], def _accumulator(self, field: str) -> Accumulator | None: - if field == 'document_id': + if field in { + 'document_id', + 'drs_uri' + }: return None elif field in ('count', 'file_size'): return DistinctAccumulator(SumAccumulator()) diff --git a/test/indexer/data/826dea02-e274-affe-aabc-eb3db63ad068.results.json b/test/indexer/data/826dea02-e274-affe-aabc-eb3db63ad068.results.json index a57dd1b7be..62dddb5ddf 100644 --- a/test/indexer/data/826dea02-e274-affe-aabc-eb3db63ad068.results.json +++ 
b/test/indexer/data/826dea02-e274-affe-aabc-eb3db63ad068.results.json @@ -162,9 +162,6 @@ "version": [ "2022-06-01T00:00:00.000000Z" ], - "drs_uri": [ - "drs://mock_tdr.lan/v1_6c87f0e1-509d-46a4-b845-7584df39263b_1e269f04-4347-4188-b060-1dcc69e71d67" - ], "count": 1 } ] @@ -1038,9 +1035,6 @@ "version": [ "2022-06-01T00:00:00.000000Z" ], - "drs_uri": [ - "drs://mock_tdr.lan/v1_6c87f0e1-509d-46a4-b845-7584df39263b_1e269f04-4347-4188-b060-1dcc69e71d67" - ], "count": 1 }, { @@ -1073,9 +1067,6 @@ "version": [ "2022-06-01T00:00:00.000000Z" ], - "drs_uri": [ - "drs://mock_tdr.lan/v1_6c87f0e1-509d-46a4-b845-7584df39263b_8b722e88-8103-49c1-b351-e64fa7c6ab37" - ], "count": 1 } ] @@ -1955,9 +1946,6 @@ "version": [ "2022-06-01T00:00:00.000000Z" ], - "drs_uri": [ - "drs://mock_tdr.lan/v1_6c87f0e1-509d-46a4-b845-7584df39263b_8b722e88-8103-49c1-b351-e64fa7c6ab37" - ], "count": 1 } ] @@ -2355,9 +2343,6 @@ "version": [ "2022-06-01T00:00:00.000000Z" ], - "drs_uri": [ - "drs://mock_tdr.lan/v1_6c87f0e1-509d-46a4-b845-7584df39263b_1e269f04-4347-4188-b060-1dcc69e71d67" - ], "count": 1 }, { @@ -2390,10 +2375,6 @@ "version": [ "2022-06-01T00:00:00.000000Z" ], - - "drs_uri": [ - "drs://mock_tdr.lan/v1_6c87f0e1-509d-46a4-b845-7584df39263b_8b722e88-8103-49c1-b351-e64fa7c6ab37" - ], "count": 1 } ] @@ -2843,9 +2824,6 @@ "version": [ "2022-06-01T00:00:00.000000Z" ], - "drs_uri": [ - "drs://mock_tdr.lan/v1_6c87f0e1-509d-46a4-b845-7584df39263b_1e269f04-4347-4188-b060-1dcc69e71d67" - ], "count": 1 }, { @@ -2878,9 +2856,6 @@ "version": [ "2022-06-01T00:00:00.000000Z" ], - "drs_uri": [ - "drs://mock_tdr.lan/v1_6c87f0e1-509d-46a4-b845-7584df39263b_8b722e88-8103-49c1-b351-e64fa7c6ab37" - ], "count": 1 } ] @@ -3329,9 +3304,6 @@ "version": [ "2022-06-01T00:00:00.000000Z" ], - "drs_uri": [ - "drs://mock_tdr.lan/v1_6c87f0e1-509d-46a4-b845-7584df39263b_1e269f04-4347-4188-b060-1dcc69e71d67" - ], "count": 1 }, { @@ -3364,9 +3336,6 @@ "version": [ "2022-06-01T00:00:00.000000Z" ], - "drs_uri": [ - 
"drs://mock_tdr.lan/v1_6c87f0e1-509d-46a4-b845-7584df39263b_8b722e88-8103-49c1-b351-e64fa7c6ab37" - ], "count": 1 } ] From 74182a5a31030a4f26439d4b9edbcf24dd4b18bd Mon Sep 17 00:00:00 2001 From: Daniel Sotirhos Date: Tue, 6 Jan 2026 18:31:30 -0800 Subject: [PATCH 34/59] [r] Remove aggregation of files.file_id in AnVIL (#6793) --- .../metadata/anvil/indexer/aggregate.py | 3 +- ...2-e274-affe-aabc-eb3db63ad068.results.json | 30 ------------------- 2 files changed, 2 insertions(+), 31 deletions(-) diff --git a/src/azul/plugins/metadata/anvil/indexer/aggregate.py b/src/azul/plugins/metadata/anvil/indexer/aggregate.py index 59a9935b3a..e0effde417 100644 --- a/src/azul/plugins/metadata/anvil/indexer/aggregate.py +++ b/src/azul/plugins/metadata/anvil/indexer/aggregate.py @@ -127,7 +127,8 @@ def _group_keys(self, entity) -> tuple[Any, ...]: def _accumulator(self, field: str) -> Accumulator | None: if field in { 'document_id', - 'drs_uri' + 'drs_uri', + 'file_id' }: return None elif field in ('count', 'file_size'): diff --git a/test/indexer/data/826dea02-e274-affe-aabc-eb3db63ad068.results.json b/test/indexer/data/826dea02-e274-affe-aabc-eb3db63ad068.results.json index 62dddb5ddf..31b5da5edc 100644 --- a/test/indexer/data/826dea02-e274-affe-aabc-eb3db63ad068.results.json +++ b/test/indexer/data/826dea02-e274-affe-aabc-eb3db63ad068.results.json @@ -136,9 +136,6 @@ "source_datarepo_row_ids": [ "file_inventory:81d16471-97ac-48fe-99a0-73d9ec62c2c0" ], - "file_id": [ - "1e269f04-4347-4188-b060-1dcc69e71d67" - ], "data_modality": [ "~null" ], @@ -1009,9 +1006,6 @@ "source_datarepo_row_ids": [ "file_inventory:81d16471-97ac-48fe-99a0-73d9ec62c2c0" ], - "file_id": [ - "1e269f04-4347-4188-b060-1dcc69e71d67" - ], "data_modality": [ "~null" ], @@ -1041,9 +1035,6 @@ "source_datarepo_row_ids": [ "file_inventory:9658d94a-511d-4b49-82c3-d0cb07e0cff2" ], - "file_id": [ - "8b722e88-8103-49c1-b351-e64fa7c6ab37" - ], "data_modality": [ "~null" ], @@ -1920,9 +1911,6 @@ 
"source_datarepo_row_ids": [ "file_inventory:9658d94a-511d-4b49-82c3-d0cb07e0cff2" ], - "file_id": [ - "8b722e88-8103-49c1-b351-e64fa7c6ab37" - ], "data_modality": [ "~null" ], @@ -2317,9 +2305,6 @@ "source_datarepo_row_ids": [ "file_inventory:81d16471-97ac-48fe-99a0-73d9ec62c2c0" ], - "file_id": [ - "1e269f04-4347-4188-b060-1dcc69e71d67" - ], "data_modality": [ "~null" ], @@ -2349,9 +2334,6 @@ "source_datarepo_row_ids": [ "file_inventory:9658d94a-511d-4b49-82c3-d0cb07e0cff2" ], - "file_id": [ - "8b722e88-8103-49c1-b351-e64fa7c6ab37" - ], "data_modality": [ "~null" ], @@ -2798,9 +2780,6 @@ "source_datarepo_row_ids": [ "file_inventory:81d16471-97ac-48fe-99a0-73d9ec62c2c0" ], - "file_id": [ - "1e269f04-4347-4188-b060-1dcc69e71d67" - ], "data_modality": [ "~null" ], @@ -2830,9 +2809,6 @@ "source_datarepo_row_ids": [ "file_inventory:9658d94a-511d-4b49-82c3-d0cb07e0cff2" ], - "file_id": [ - "8b722e88-8103-49c1-b351-e64fa7c6ab37" - ], "data_modality": [ "~null" ], @@ -3278,9 +3254,6 @@ "source_datarepo_row_ids": [ "file_inventory:81d16471-97ac-48fe-99a0-73d9ec62c2c0" ], - "file_id": [ - "1e269f04-4347-4188-b060-1dcc69e71d67" - ], "data_modality": [ "~null" ], @@ -3310,9 +3283,6 @@ "source_datarepo_row_ids": [ "file_inventory:9658d94a-511d-4b49-82c3-d0cb07e0cff2" ], - "file_id": [ - "8b722e88-8103-49c1-b351-e64fa7c6ab37" - ], "data_modality": [ "~null" ], From f986a8718ebd59237b6fe285e5aa8936462e1860 Mon Sep 17 00:00:00 2001 From: Daniel Sotirhos Date: Tue, 6 Jan 2026 18:34:17 -0800 Subject: [PATCH 35/59] [r] Remove aggregation of files.file_md5sum in AnVIL (#6793) --- .../metadata/anvil/indexer/aggregate.py | 3 +- ...2-e274-affe-aabc-eb3db63ad068.results.json | 30 ------------------- 2 files changed, 2 insertions(+), 31 deletions(-) diff --git a/src/azul/plugins/metadata/anvil/indexer/aggregate.py b/src/azul/plugins/metadata/anvil/indexer/aggregate.py index e0effde417..f3339979d5 100644 --- a/src/azul/plugins/metadata/anvil/indexer/aggregate.py +++ 
b/src/azul/plugins/metadata/anvil/indexer/aggregate.py @@ -128,7 +128,8 @@ def _accumulator(self, field: str) -> Accumulator | None: if field in { 'document_id', 'drs_uri', - 'file_id' + 'file_id', + 'file_md5sum' }: return None elif field in ('count', 'file_size'): diff --git a/test/indexer/data/826dea02-e274-affe-aabc-eb3db63ad068.results.json b/test/indexer/data/826dea02-e274-affe-aabc-eb3db63ad068.results.json index 31b5da5edc..885c5913d7 100644 --- a/test/indexer/data/826dea02-e274-affe-aabc-eb3db63ad068.results.json +++ b/test/indexer/data/826dea02-e274-affe-aabc-eb3db63ad068.results.json @@ -144,9 +144,6 @@ ], "file_size": 213021639, "file_size_": 213021639, - "file_md5sum": [ - "beec606ee0aa299fdf913f4259316622" - ], "reference_assembly": [ "~null" ], @@ -1014,9 +1011,6 @@ ], "file_size": 213021639, "file_size_": 213021639, - "file_md5sum": [ - "beec606ee0aa299fdf913f4259316622" - ], "reference_assembly": [ "~null" ], @@ -1043,9 +1037,6 @@ ], "file_size": 3306845592, "file_size_": 3306845592, - "file_md5sum": [ - "7cd9fd7b54a8bf380e44e93706f1fa2d" - ], "reference_assembly": [ "~null" ], @@ -1919,9 +1910,6 @@ ], "file_size": 3306845592, "file_size_": 3306845592, - "file_md5sum": [ - "7cd9fd7b54a8bf380e44e93706f1fa2d" - ], "reference_assembly": [ "~null" ], @@ -2313,9 +2301,6 @@ ], "file_size": 213021639, "file_size_": 213021639, - "file_md5sum": [ - "beec606ee0aa299fdf913f4259316622" - ], "reference_assembly": [ "~null" ], @@ -2342,9 +2327,6 @@ ], "file_size": 3306845592, "file_size_": 3306845592, - "file_md5sum": [ - "7cd9fd7b54a8bf380e44e93706f1fa2d" - ], "reference_assembly": [ "~null" ], @@ -2788,9 +2770,6 @@ ], "file_size": 213021639, "file_size_": 213021639, - "file_md5sum": [ - "beec606ee0aa299fdf913f4259316622" - ], "reference_assembly": [ "~null" ], @@ -2817,9 +2796,6 @@ ], "file_size": 3306845592, "file_size_": 3306845592, - "file_md5sum": [ - "7cd9fd7b54a8bf380e44e93706f1fa2d" - ], "reference_assembly": [ "~null" ], @@ -3262,9 +3238,6 @@ ], 
"file_size": 213021639, "file_size_": 213021639, - "file_md5sum": [ - "beec606ee0aa299fdf913f4259316622" - ], "reference_assembly": [ "~null" ], @@ -3291,9 +3264,6 @@ ], "file_size": 3306845592, "file_size_": 3306845592, - "file_md5sum": [ - "7cd9fd7b54a8bf380e44e93706f1fa2d" - ], "reference_assembly": [ "~null" ], From 595b4ecd272c09850cc987891ec739d8f8a7a1ac Mon Sep 17 00:00:00 2001 From: Daniel Sotirhos Date: Tue, 6 Jan 2026 18:36:22 -0800 Subject: [PATCH 36/59] [r] Remove aggregation of files.file_name in AnVIL (#6793) --- .../metadata/anvil/indexer/aggregate.py | 3 +- ...2-e274-affe-aabc-eb3db63ad068.results.json | 30 ------------------- 2 files changed, 2 insertions(+), 31 deletions(-) diff --git a/src/azul/plugins/metadata/anvil/indexer/aggregate.py b/src/azul/plugins/metadata/anvil/indexer/aggregate.py index f3339979d5..f1b60cf8cf 100644 --- a/src/azul/plugins/metadata/anvil/indexer/aggregate.py +++ b/src/azul/plugins/metadata/anvil/indexer/aggregate.py @@ -129,7 +129,8 @@ def _accumulator(self, field: str) -> Accumulator | None: 'document_id', 'drs_uri', 'file_id', - 'file_md5sum' + 'file_md5sum', + 'file_name' }: return None elif field in ('count', 'file_size'): diff --git a/test/indexer/data/826dea02-e274-affe-aabc-eb3db63ad068.results.json b/test/indexer/data/826dea02-e274-affe-aabc-eb3db63ad068.results.json index 885c5913d7..4d64bcdfa3 100644 --- a/test/indexer/data/826dea02-e274-affe-aabc-eb3db63ad068.results.json +++ b/test/indexer/data/826dea02-e274-affe-aabc-eb3db63ad068.results.json @@ -147,9 +147,6 @@ "reference_assembly": [ "~null" ], - "file_name": [ - "307500.merged.matefixed.sorted.markeddups.recal.g.vcf.gz" - ], "is_supplementary": [ 0 ], @@ -1014,9 +1011,6 @@ "reference_assembly": [ "~null" ], - "file_name": [ - "307500.merged.matefixed.sorted.markeddups.recal.g.vcf.gz" - ], "is_supplementary": [ 0 ], @@ -1040,9 +1034,6 @@ "reference_assembly": [ "~null" ], - "file_name": [ - "307500.merged.matefixed.sorted.markeddups.recal.bam" - ], 
"is_supplementary": [ 0 ], @@ -1913,9 +1904,6 @@ "reference_assembly": [ "~null" ], - "file_name": [ - "307500.merged.matefixed.sorted.markeddups.recal.bam" - ], "is_supplementary": [ 0 ], @@ -2304,9 +2292,6 @@ "reference_assembly": [ "~null" ], - "file_name": [ - "307500.merged.matefixed.sorted.markeddups.recal.g.vcf.gz" - ], "is_supplementary": [ 0 ], @@ -2330,9 +2315,6 @@ "reference_assembly": [ "~null" ], - "file_name": [ - "307500.merged.matefixed.sorted.markeddups.recal.bam" - ], "is_supplementary": [ 0 ], @@ -2773,9 +2755,6 @@ "reference_assembly": [ "~null" ], - "file_name": [ - "307500.merged.matefixed.sorted.markeddups.recal.g.vcf.gz" - ], "is_supplementary": [ 0 ], @@ -2799,9 +2778,6 @@ "reference_assembly": [ "~null" ], - "file_name": [ - "307500.merged.matefixed.sorted.markeddups.recal.bam" - ], "is_supplementary": [ 0 ], @@ -3241,9 +3217,6 @@ "reference_assembly": [ "~null" ], - "file_name": [ - "307500.merged.matefixed.sorted.markeddups.recal.g.vcf.gz" - ], "is_supplementary": [ 0 ], @@ -3267,9 +3240,6 @@ "reference_assembly": [ "~null" ], - "file_name": [ - "307500.merged.matefixed.sorted.markeddups.recal.bam" - ], "is_supplementary": [ 0 ], From d849810264999a89e8a782ef17ad2e93ab408105 Mon Sep 17 00:00:00 2001 From: Daniel Sotirhos Date: Tue, 6 Jan 2026 18:38:23 -0800 Subject: [PATCH 37/59] [r] Remove aggregation of files.source_datarepo_row_ids in AnVIL (#6793) --- .../metadata/anvil/indexer/aggregate.py | 3 +- ...2-e274-affe-aabc-eb3db63ad068.results.json | 30 ------------------- 2 files changed, 2 insertions(+), 31 deletions(-) diff --git a/src/azul/plugins/metadata/anvil/indexer/aggregate.py b/src/azul/plugins/metadata/anvil/indexer/aggregate.py index f1b60cf8cf..42287a0458 100644 --- a/src/azul/plugins/metadata/anvil/indexer/aggregate.py +++ b/src/azul/plugins/metadata/anvil/indexer/aggregate.py @@ -130,7 +130,8 @@ def _accumulator(self, field: str) -> Accumulator | None: 'drs_uri', 'file_id', 'file_md5sum', - 'file_name' + 'file_name', + 
'source_datarepo_row_ids' }: return None elif field in ('count', 'file_size'): diff --git a/test/indexer/data/826dea02-e274-affe-aabc-eb3db63ad068.results.json b/test/indexer/data/826dea02-e274-affe-aabc-eb3db63ad068.results.json index 4d64bcdfa3..11dda7907d 100644 --- a/test/indexer/data/826dea02-e274-affe-aabc-eb3db63ad068.results.json +++ b/test/indexer/data/826dea02-e274-affe-aabc-eb3db63ad068.results.json @@ -133,9 +133,6 @@ ], "files": [ { - "source_datarepo_row_ids": [ - "file_inventory:81d16471-97ac-48fe-99a0-73d9ec62c2c0" - ], "data_modality": [ "~null" ], @@ -997,9 +994,6 @@ ], "files": [ { - "source_datarepo_row_ids": [ - "file_inventory:81d16471-97ac-48fe-99a0-73d9ec62c2c0" - ], "data_modality": [ "~null" ], @@ -1020,9 +1014,6 @@ "count": 1 }, { - "source_datarepo_row_ids": [ - "file_inventory:9658d94a-511d-4b49-82c3-d0cb07e0cff2" - ], "data_modality": [ "~null" ], @@ -1890,9 +1881,6 @@ ], "files": [ { - "source_datarepo_row_ids": [ - "file_inventory:9658d94a-511d-4b49-82c3-d0cb07e0cff2" - ], "data_modality": [ "~null" ], @@ -2278,9 +2266,6 @@ ], "files": [ { - "source_datarepo_row_ids": [ - "file_inventory:81d16471-97ac-48fe-99a0-73d9ec62c2c0" - ], "data_modality": [ "~null" ], @@ -2301,9 +2286,6 @@ "count": 1 }, { - "source_datarepo_row_ids": [ - "file_inventory:9658d94a-511d-4b49-82c3-d0cb07e0cff2" - ], "data_modality": [ "~null" ], @@ -2741,9 +2723,6 @@ ], "files": [ { - "source_datarepo_row_ids": [ - "file_inventory:81d16471-97ac-48fe-99a0-73d9ec62c2c0" - ], "data_modality": [ "~null" ], @@ -2764,9 +2743,6 @@ "count": 1 }, { - "source_datarepo_row_ids": [ - "file_inventory:9658d94a-511d-4b49-82c3-d0cb07e0cff2" - ], "data_modality": [ "~null" ], @@ -3203,9 +3179,6 @@ ], "files": [ { - "source_datarepo_row_ids": [ - "file_inventory:81d16471-97ac-48fe-99a0-73d9ec62c2c0" - ], "data_modality": [ "~null" ], @@ -3226,9 +3199,6 @@ "count": 1 }, { - "source_datarepo_row_ids": [ - "file_inventory:9658d94a-511d-4b49-82c3-d0cb07e0cff2" - ], "data_modality": [ 
"~null" ], From d7794b1f0b1b6f5d25619677bc6c87e87cd4d0a3 Mon Sep 17 00:00:00 2001 From: Daniel Sotirhos Date: Tue, 6 Jan 2026 16:14:29 -0800 Subject: [PATCH 38/59] [r] Remove aggregation of files.version in AnVIL (#6793) --- .../metadata/anvil/indexer/aggregate.py | 3 +- ...2-e274-affe-aabc-eb3db63ad068.results.json | 30 ------------------- 2 files changed, 2 insertions(+), 31 deletions(-) diff --git a/src/azul/plugins/metadata/anvil/indexer/aggregate.py b/src/azul/plugins/metadata/anvil/indexer/aggregate.py index 42287a0458..6b6780502f 100644 --- a/src/azul/plugins/metadata/anvil/indexer/aggregate.py +++ b/src/azul/plugins/metadata/anvil/indexer/aggregate.py @@ -131,7 +131,8 @@ def _accumulator(self, field: str) -> Accumulator | None: 'file_id', 'file_md5sum', 'file_name', - 'source_datarepo_row_ids' + 'source_datarepo_row_ids', + 'version' }: return None elif field in ('count', 'file_size'): diff --git a/test/indexer/data/826dea02-e274-affe-aabc-eb3db63ad068.results.json b/test/indexer/data/826dea02-e274-affe-aabc-eb3db63ad068.results.json index 11dda7907d..47c6227a9a 100644 --- a/test/indexer/data/826dea02-e274-affe-aabc-eb3db63ad068.results.json +++ b/test/indexer/data/826dea02-e274-affe-aabc-eb3db63ad068.results.json @@ -147,9 +147,6 @@ "is_supplementary": [ 0 ], - "version": [ - "2022-06-01T00:00:00.000000Z" - ], "count": 1 } ] @@ -1008,9 +1005,6 @@ "is_supplementary": [ 0 ], - "version": [ - "2022-06-01T00:00:00.000000Z" - ], "count": 1 }, { @@ -1028,9 +1022,6 @@ "is_supplementary": [ 0 ], - "version": [ - "2022-06-01T00:00:00.000000Z" - ], "count": 1 } ] @@ -1895,9 +1886,6 @@ "is_supplementary": [ 0 ], - "version": [ - "2022-06-01T00:00:00.000000Z" - ], "count": 1 } ] @@ -2280,9 +2268,6 @@ "is_supplementary": [ 0 ], - "version": [ - "2022-06-01T00:00:00.000000Z" - ], "count": 1 }, { @@ -2300,9 +2285,6 @@ "is_supplementary": [ 0 ], - "version": [ - "2022-06-01T00:00:00.000000Z" - ], "count": 1 } ] @@ -2737,9 +2719,6 @@ "is_supplementary": [ 0 ], - "version": 
[ - "2022-06-01T00:00:00.000000Z" - ], "count": 1 }, { @@ -2757,9 +2736,6 @@ "is_supplementary": [ 0 ], - "version": [ - "2022-06-01T00:00:00.000000Z" - ], "count": 1 } ] @@ -3193,9 +3169,6 @@ "is_supplementary": [ 0 ], - "version": [ - "2022-06-01T00:00:00.000000Z" - ], "count": 1 }, { @@ -3213,9 +3186,6 @@ "is_supplementary": [ 0 ], - "version": [ - "2022-06-01T00:00:00.000000Z" - ], "count": 1 } ] From b816da377eef8101e9093248c8fcbc3de46b312f Mon Sep 17 00:00:00 2001 From: Daniel Sotirhos Date: Wed, 21 Jan 2026 10:46:33 -0800 Subject: [PATCH 39/59] [r] Increase accumulator limit for cell_suspensions.biomaterial_id and document_id in HCA (#6793) --- src/azul/plugins/metadata/hca/indexer/aggregate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/azul/plugins/metadata/hca/indexer/aggregate.py b/src/azul/plugins/metadata/hca/indexer/aggregate.py index 90fe55e96b..14ed5a3e6f 100644 --- a/src/azul/plugins/metadata/hca/indexer/aggregate.py +++ b/src/azul/plugins/metadata/hca/indexer/aggregate.py @@ -173,7 +173,7 @@ def _accumulator(self, field) -> Accumulator | None: # These fields are only aggregated for files, where they are needed # for compact and PFB manifests if self.outer_entity_type == 'files': - return super()._accumulator(field) + return SetAccumulator(max_size=int(9766 * 1.25)) else: return None elif field in self.cell_count_fields: From 6f6a24986fe369cc5ef53c5a175b08676d21c464 Mon Sep 17 00:00:00 2001 From: Daniel Sotirhos Date: Wed, 21 Jan 2026 10:48:29 -0800 Subject: [PATCH 40/59] [r] Increase accumulator limit for donors.biomaterial_id in HCA (#6793) --- src/azul/plugins/metadata/hca/indexer/aggregate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/azul/plugins/metadata/hca/indexer/aggregate.py b/src/azul/plugins/metadata/hca/indexer/aggregate.py index 14ed5a3e6f..197932183c 100644 --- a/src/azul/plugins/metadata/hca/indexer/aggregate.py +++ b/src/azul/plugins/metadata/hca/indexer/aggregate.py @@ -209,7 
+209,7 @@ def _accumulator(self, field) -> Accumulator | None: # These fields are only aggregated for files, where they are needed # for compact and PFB manifests if self.outer_entity_type == 'files': - return super()._accumulator(field) + return SetAccumulator(max_size=int(931 * 1.25)) else: return None elif field == 'organism_age_range': From b398f9b8a5901b13c6ac2dc95a95c3a6f506aea4 Mon Sep 17 00:00:00 2001 From: Daniel Sotirhos Date: Tue, 6 Jan 2026 18:41:40 -0800 Subject: [PATCH 41/59] [r] Increase accumulator limit for donors.development_stage in HCA (#6793) --- src/azul/plugins/metadata/hca/indexer/aggregate.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/azul/plugins/metadata/hca/indexer/aggregate.py b/src/azul/plugins/metadata/hca/indexer/aggregate.py index 197932183c..e86da4b8ec 100644 --- a/src/azul/plugins/metadata/hca/indexer/aggregate.py +++ b/src/azul/plugins/metadata/hca/indexer/aggregate.py @@ -212,6 +212,8 @@ def _accumulator(self, field) -> Accumulator | None: return SetAccumulator(max_size=int(931 * 1.25)) else: return None + elif field == 'development_stage': + return SetAccumulator(max_size=int(124 * 1.25)) elif field == 'organism_age_range': return SetAccumulator(max_size=100) elif field == 'organism_age': From 91db4d53b99f971ed1ee396e6994d40a0ced0199 Mon Sep 17 00:00:00 2001 From: Daniel Sotirhos Date: Fri, 9 Jan 2026 11:02:56 -0800 Subject: [PATCH 42/59] [r] Increase accumulator limit for donors.document_id in HCA (#6793) --- src/azul/plugins/metadata/hca/indexer/aggregate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/azul/plugins/metadata/hca/indexer/aggregate.py b/src/azul/plugins/metadata/hca/indexer/aggregate.py index e86da4b8ec..5bfbec6ec0 100644 --- a/src/azul/plugins/metadata/hca/indexer/aggregate.py +++ b/src/azul/plugins/metadata/hca/indexer/aggregate.py @@ -230,7 +230,7 @@ def _accumulator(self, field) -> Accumulator | None: # # FIXME: Enforce that hot entity types are completely aggregated # 
https://github.com/DataBiosphere/azul/issues/6793 - return SetAccumulator(max_size=100) + return SetAccumulator(max_size=int(931 * 1.25)) else: return super()._accumulator(field) From b53a53fa3d6aed046a0289e447ca8f0a5a4bfc9c Mon Sep 17 00:00:00 2001 From: Daniel Sotirhos Date: Tue, 6 Jan 2026 18:43:13 -0800 Subject: [PATCH 43/59] [r] Increase accumulator limit for donors.organism_age_range in HCA (#6793) --- src/azul/plugins/metadata/hca/indexer/aggregate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/azul/plugins/metadata/hca/indexer/aggregate.py b/src/azul/plugins/metadata/hca/indexer/aggregate.py index 5bfbec6ec0..2589c30606 100644 --- a/src/azul/plugins/metadata/hca/indexer/aggregate.py +++ b/src/azul/plugins/metadata/hca/indexer/aggregate.py @@ -215,7 +215,7 @@ def _accumulator(self, field) -> Accumulator | None: elif field == 'development_stage': return SetAccumulator(max_size=int(124 * 1.25)) elif field == 'organism_age_range': - return SetAccumulator(max_size=100) + return SetAccumulator(max_size=int(107 * 1.25)) elif field == 'organism_age': return SetOfDictAccumulator(max_size=100, key=compose_keys(none_safe_tuple_key(none_last=True), From a6f4dec965757795eda0528aa8a57608e5068032 Mon Sep 17 00:00:00 2001 From: Daniel Sotirhos Date: Tue, 6 Jan 2026 18:44:33 -0800 Subject: [PATCH 44/59] [r] Increase accumulator limit for donors.organism_age in HCA (#6793) --- src/azul/plugins/metadata/hca/indexer/aggregate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/azul/plugins/metadata/hca/indexer/aggregate.py b/src/azul/plugins/metadata/hca/indexer/aggregate.py index 2589c30606..d701dae331 100644 --- a/src/azul/plugins/metadata/hca/indexer/aggregate.py +++ b/src/azul/plugins/metadata/hca/indexer/aggregate.py @@ -217,7 +217,7 @@ def _accumulator(self, field) -> Accumulator | None: elif field == 'organism_age_range': return SetAccumulator(max_size=int(107 * 1.25)) elif field == 'organism_age': - return 
SetOfDictAccumulator(max_size=100, + return SetOfDictAccumulator(max_size=int(107 * 1.25), key=compose_keys(none_safe_tuple_key(none_last=True), none_safe_itemgetter('value', 'unit'))) elif field == 'donor_count': From b3c4aa46cc5ceb64fa3beb92531b853a27b8ce8d Mon Sep 17 00:00:00 2001 From: Daniel Sotirhos Date: Tue, 6 Jan 2026 18:46:52 -0800 Subject: [PATCH 45/59] [r] Increase accumulator limit for matrices.file in HCA (#6793) --- src/azul/plugins/metadata/hca/indexer/aggregate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/azul/plugins/metadata/hca/indexer/aggregate.py b/src/azul/plugins/metadata/hca/indexer/aggregate.py index d701dae331..470e880e40 100644 --- a/src/azul/plugins/metadata/hca/indexer/aggregate.py +++ b/src/azul/plugins/metadata/hca/indexer/aggregate.py @@ -335,7 +335,7 @@ def _accumulator(self, field) -> Accumulator | None: if field == 'document_id': return None elif field == 'file': - return DictAccumulator(max_size=100, key=itemgetter('uuid')) + return DictAccumulator(max_size=int(515 * 1.25), key=itemgetter('uuid')) else: return SetAccumulator() From 2fbbe3458056b9d2ac37da5a8d65d8b066164661 Mon Sep 17 00:00:00 2001 From: Daniel Sotirhos Date: Tue, 20 Jan 2026 10:45:11 -0800 Subject: [PATCH 46/59] [r] Increase accumulator limit for samples.biomaterial_id and document_id in HCA (#6793) --- src/azul/plugins/metadata/hca/indexer/aggregate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/azul/plugins/metadata/hca/indexer/aggregate.py b/src/azul/plugins/metadata/hca/indexer/aggregate.py index 470e880e40..9c3befe2fc 100644 --- a/src/azul/plugins/metadata/hca/indexer/aggregate.py +++ b/src/azul/plugins/metadata/hca/indexer/aggregate.py @@ -122,7 +122,7 @@ def _accumulator(self, field) -> Accumulator | None: # These fields are only aggregated for files, where they are needed # for compact and PFB manifests if self.outer_entity_type == 'files': - return super()._accumulator(field) + return 
SetAccumulator(max_size=int(1209 * 1.25)) else: return None else: From ba29ce531b2ea085c1758a7d75dc9d3dd5851074 Mon Sep 17 00:00:00 2001 From: Daniel Sotirhos Date: Wed, 21 Jan 2026 10:52:33 -0800 Subject: [PATCH 47/59] [r] Increase accumulator limit for sequencing_inputs.biomaterial_id and document_id in HCA (#6793) --- src/azul/plugins/metadata/hca/indexer/aggregate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/azul/plugins/metadata/hca/indexer/aggregate.py b/src/azul/plugins/metadata/hca/indexer/aggregate.py index 9c3befe2fc..416e0440f1 100644 --- a/src/azul/plugins/metadata/hca/indexer/aggregate.py +++ b/src/azul/plugins/metadata/hca/indexer/aggregate.py @@ -305,7 +305,7 @@ def _accumulator(self, field) -> Accumulator | None: # These fields are only aggregated for files, where they are needed # for compact and PFB manifests if self.outer_entity_type == 'files': - return super()._accumulator(field) + return SetAccumulator(max_size=int(7302 * 1.25)) else: return None else: From 218d75dcd8d982b4f9b4e44ccda210707095a7dc Mon Sep 17 00:00:00 2001 From: Daniel Sotirhos Date: Tue, 20 Jan 2026 10:52:20 -0800 Subject: [PATCH 48/59] [r] Increase accumulator limit for sequencing_processes.document_id in HCA (#6793) --- src/azul/plugins/metadata/hca/indexer/aggregate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/azul/plugins/metadata/hca/indexer/aggregate.py b/src/azul/plugins/metadata/hca/indexer/aggregate.py index 416e0440f1..92af108fd6 100644 --- a/src/azul/plugins/metadata/hca/indexer/aggregate.py +++ b/src/azul/plugins/metadata/hca/indexer/aggregate.py @@ -319,7 +319,7 @@ def _accumulator(self, field) -> Accumulator | None: # These fields are only aggregated for files, where they are needed # for compact and PFB manifests if self.outer_entity_type == 'files': - return super()._accumulator(field) + return SetAccumulator(max_size=int(6357 * 1.25)) else: return None else: From 528c0a6770bcc659967b1a672cf6c94400ccc2de 
Mon Sep 17 00:00:00 2001 From: Daniel Sotirhos Date: Tue, 20 Jan 2026 10:46:37 -0800 Subject: [PATCH 49/59] [r] Increase accumulator limit for specimens.biomaterial_id and document_id in HCA (#6793) --- src/azul/plugins/metadata/hca/indexer/aggregate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/azul/plugins/metadata/hca/indexer/aggregate.py b/src/azul/plugins/metadata/hca/indexer/aggregate.py index 92af108fd6..775a57f3b5 100644 --- a/src/azul/plugins/metadata/hca/indexer/aggregate.py +++ b/src/azul/plugins/metadata/hca/indexer/aggregate.py @@ -136,7 +136,7 @@ def _accumulator(self, field) -> Accumulator | None: # These fields are only aggregated for files, where they are needed # for compact and PFB manifests if self.outer_entity_type == 'files': - return super()._accumulator(field) + return SetAccumulator(max_size=int(1209 * 1.25)) # `document_id` is included in the sample aggregate so that the # summary response field `specimenCount` can be calculated. This # should not be a problem since there should only ever be one From b27bf7a11e2503b694ffe1ee21e964ebb442a757 Mon Sep 17 00:00:00 2001 From: Daniel Sotirhos Date: Fri, 27 Feb 2026 10:14:35 -0800 Subject: [PATCH 50/59] [1/3] Raise exception if aggregation drops values, add overflow option Add overflow option --- src/azul/indexer/aggregate.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/src/azul/indexer/aggregate.py b/src/azul/indexer/aggregate.py index 6dcb733de4..23668988de 100644 --- a/src/azul/indexer/aggregate.py +++ b/src/azul/indexer/aggregate.py @@ -132,7 +132,9 @@ class SetAccumulator[V: Hashable](Accumulator[V, list[V]]): def __init__(self, max_size: int | None = None, - key: Callable[[V], SupportsRichComparison] | None = None + key: Callable[[V], SupportsRichComparison] | None = None, + *, + allow_overflow: bool = False ) -> None: """ :param max_size: the maximum number of elements to retain @@ -147,6 +149,7 @@ def __init__(self, 
self.value: set[V] = set() self.max_size = max_size self.key = none_safe_key(none_last=True) if key is None else key + self.allow_overflow = allow_overflow def accumulate(self, value: V | list[V]) -> int: """ @@ -605,8 +608,14 @@ def _aggregate(self, aggregate: Aggregate) -> JSON: if accumulator is not None: result[k] = accumulator.get() if accumulator.dropped > 0: - log.warning('Values were dropped %d times while aggregating %s.%s into %s', - accumulator.dropped, self.entity_type, k, self.outer_entity_type) + message = ( + f'Values were dropped {accumulator.dropped} times while aggregating ' + f'{self.entity_type}.{k} into {self.outer_entity_type}' + ) + if isinstance(accumulator, SetAccumulator) and accumulator.allow_overflow: + log.warning(message) + else: + log.warning(message) return result From 980b0c13f3e91e3b5ca691eb109c62f780a39e16 Mon Sep 17 00:00:00 2001 From: Daniel Sotirhos Date: Wed, 17 Dec 2025 17:07:41 -0800 Subject: [PATCH 51/59] [r 2/3] Raise exception if aggregation drops values, add overflow option Allow accumulator overflow for diagnoses.disease in AnVIL (#6793) --- src/azul/plugins/metadata/anvil/indexer/aggregate.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/azul/plugins/metadata/anvil/indexer/aggregate.py b/src/azul/plugins/metadata/anvil/indexer/aggregate.py index 6b6780502f..bce26ccc25 100644 --- a/src/azul/plugins/metadata/anvil/indexer/aggregate.py +++ b/src/azul/plugins/metadata/anvil/indexer/aggregate.py @@ -9,6 +9,7 @@ Accumulator, DistinctAccumulator, GroupingAggregator, + SetAccumulator, SetOfDictAccumulator, SimpleAggregator, SumAccumulator, @@ -90,6 +91,11 @@ def _accumulator(self, field: str) -> Accumulator | None: return SetOfDictAccumulator(max_size=100, key=compose_keys(none_safe_tuple_key(none_last=True), itemgetter('lte', 'gte'))) + elif field == 'disease': + return SetAccumulator(max_size=100, + # Some AnVIL datasets have excessive numbers + # of disease values, all being accessions. 
+ allow_overflow=self.outer_entity_type == 'datasets') else: return super()._accumulator(field) From 86763c96124712bf829134f76a48fc338c98e792 Mon Sep 17 00:00:00 2001 From: Daniel Sotirhos Date: Fri, 27 Feb 2026 10:04:59 -0800 Subject: [PATCH 52/59] [3/3] Raise exception if aggregation drops values, add overflow option Raise exception if values are dropped when overflow option is not set --- src/azul/indexer/aggregate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/azul/indexer/aggregate.py b/src/azul/indexer/aggregate.py index 23668988de..8c6bf8fe52 100644 --- a/src/azul/indexer/aggregate.py +++ b/src/azul/indexer/aggregate.py @@ -615,7 +615,7 @@ def _aggregate(self, aggregate: Aggregate) -> JSON: if isinstance(accumulator, SetAccumulator) and accumulator.allow_overflow: log.warning(message) else: - log.warning(message) + assert False, R(message) return result From 43b40bf8b462f961dd49579cc267467354ac28b6 Mon Sep 17 00:00:00 2001 From: Daniel Sotirhos Date: Fri, 30 Jan 2026 15:13:43 -0800 Subject: [PATCH 53/59] Add comment for need of using getitem() --- src/azul/service/manifest_service.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/azul/service/manifest_service.py b/src/azul/service/manifest_service.py index f43f2d0858..8b165504fb 100644 --- a/src/azul/service/manifest_service.py +++ b/src/azul/service/manifest_service.py @@ -1963,6 +1963,11 @@ def _list_replica_keys(self) -> Iterable[ReplicaKeys]: document_ids = [ document_id for entity_type in self.hot_entity_types + # Some "hot" entity types may be missing from hit['contents'] + # due to the paged request using a document slice (aka an + # Elasticsearch "source filter"). This causes a result that only + # includes fields that match one of "includes" pattern in the + # document slice. 
See :meth:`_create_pipeline` for inner_entity in getitem(hit['contents'], entity_type, ()) # `document_id` is a scalar (string) when the inner and outer # entity types match, and an array otherwise. `None` should not From fe0ba34354c1813ac5f819af541cb53ba694f015 Mon Sep 17 00:00:00 2001 From: Daniel Sotirhos Date: Fri, 3 Apr 2026 10:14:00 -0700 Subject: [PATCH 54/59] fixup! Add comment for need of using getitem() --- src/azul/service/manifest_service.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/azul/service/manifest_service.py b/src/azul/service/manifest_service.py index 8b165504fb..0f2c123426 100644 --- a/src/azul/service/manifest_service.py +++ b/src/azul/service/manifest_service.py @@ -1963,11 +1963,12 @@ def _list_replica_keys(self) -> Iterable[ReplicaKeys]: document_ids = [ document_id for entity_type in self.hot_entity_types - # Some "hot" entity types may be missing from hit['contents'] - # due to the paged request using a document slice (aka an - # Elasticsearch "source filter"). This causes a result that only - # includes fields that match one of "includes" pattern in the - # document slice. See :meth:`_create_pipeline` + # Some "hot" entity types may be missing from hit['contents'], + # e.g. `imaging_protocols` if `contents.imaging_protocols` in + # the source document is an empty list. This is due to our + # document slice (aka an Elasticsearch "source filter") limiting + # the results to fields that match the "includes" pattern, and + # have an actual value. See :meth:`_create_pipeline` for inner_entity in getitem(hit['contents'], entity_type, ()) # `document_id` is a scalar (string) when the inner and outer # entity types match, and an array otherwise. 
`None` should not From 7ec35f104f42e4b9a02f8a4801c5965119d90459 Mon Sep 17 00:00:00 2001 From: Daniel Sotirhos Date: Wed, 4 Mar 2026 09:16:11 -0800 Subject: [PATCH 55/59] Add comment for file aggregate and replica documents relationship --- src/azul/indexer/document.py | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/src/azul/indexer/document.py b/src/azul/indexer/document.py index 30707a80f7..c8699a51b0 100644 --- a/src/azul/indexer/document.py +++ b/src/azul/indexer/document.py @@ -94,6 +94,36 @@ def for_entity(cls, catalog: CatalogName, entity: EntityReference): entity_id=entity.entity_id) +# The ID fields that link file aggregates to replica documents: +# +# ┏━━━━━━━━━━━━━━━━━━┓ +# ┃ Project replica ┃ +# ┃ ┃ +# ┣━━━━━━━━━━━━━━━━━━┫ ┏━━━━━━━━━━━━━━━━━━┓ +# ┃ entity_id ┃◀─┐ ┃ File replica ┃ +# ┣━━━━━━━━━━━━━━━━━━┫ │ ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓ ┃ ┃ +# ┃ hub_ids ┃ │ ┃ File aggregate ┃ ┣━━━━━━━━━━━━━━━━━━┫ +# ┗━━━━━━━━━━━━━━━━━━┛ │ ┃ ┃ ┌─▶┃ entity_id ┃ +# │ ┣━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┫ │ ┣━━━━━━━━━━━━━━━━━━┫ +# ┏━━━━━━━━━━━━━━━━━━┓ │ ┃ entity_id ┃──┼─▶┃ hub_ids ┃ +# ┃ Donor replica ┃ │ ┣━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┫ │ ┗━━━━━━━━━━━━━━━━━━┛ +# ┃ ┃ └──┃ contents.projects.document_id ┃ │ +# ┣━━━━━━━━━━━━━━━━━━┫ ┣━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┫ │ ┏━━━━━━━━━━━━━━━━━━┓ +# ┃ entity_id ┃◀────┃ contents.donors.document_id ┃ │ ┃ Specimen replica ┃ +# ┣━━━━━━━━━━━━━━━━━━┫ ┣━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┫ │ ┃ ┃ +# ┃ hub_ids ┃ ┌──┃ contents.protocols.document_id ┃ │ ┣━━━━━━━━━━━━━━━━━━┫ +# ┗━━━━━━━━━━━━━━━━━━┛ │ ┗━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┛ │ ┃ entity_id ┃ +# │ │ ┣━━━━━━━━━━━━━━━━━━┫ +# ┏━━━━━━━━━━━━━━━━━━┓ │ └─▶┃ hub_ids ┃ +# ┃ Protocol replica ┃ │ ┗━━━━━━━━━━━━━━━━━━┛ +# ┃ ┃ │ +# ┣━━━━━━━━━━━━━━━━━━┫ │ +# ┃ entity_id ┃◀─┘ +# ┣━━━━━━━━━━━━━━━━━━┫ +# ┃ hub_ids ┃ +# ┗━━━━━━━━━━━━━━━━━━┛ + + class DocumentType(Enum): contribution = 'contribution' aggregate = 'aggregate' From 325c415dc45835295ef75968fd441f0735a881dc Mon Sep 
17 00:00:00 2001 From: Daniel Sotirhos Date: Tue, 10 Mar 2026 10:09:06 -0700 Subject: [PATCH 56/59] fixup! Add comment for file aggregate and replica documents relationship --- src/azul/indexer/document.py | 30 ------------------- .../plugins/metadata/hca/indexer/transform.py | 27 +++++++++++++++++ 2 files changed, 27 insertions(+), 30 deletions(-) diff --git a/src/azul/indexer/document.py b/src/azul/indexer/document.py index c8699a51b0..30707a80f7 100644 --- a/src/azul/indexer/document.py +++ b/src/azul/indexer/document.py @@ -94,36 +94,6 @@ def for_entity(cls, catalog: CatalogName, entity: EntityReference): entity_id=entity.entity_id) -# The ID fields that link file aggregates to replica documents: -# -# ┏━━━━━━━━━━━━━━━━━━┓ -# ┃ Project replica ┃ -# ┃ ┃ -# ┣━━━━━━━━━━━━━━━━━━┫ ┏━━━━━━━━━━━━━━━━━━┓ -# ┃ entity_id ┃◀─┐ ┃ File replica ┃ -# ┣━━━━━━━━━━━━━━━━━━┫ │ ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓ ┃ ┃ -# ┃ hub_ids ┃ │ ┃ File aggregate ┃ ┣━━━━━━━━━━━━━━━━━━┫ -# ┗━━━━━━━━━━━━━━━━━━┛ │ ┃ ┃ ┌─▶┃ entity_id ┃ -# │ ┣━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┫ │ ┣━━━━━━━━━━━━━━━━━━┫ -# ┏━━━━━━━━━━━━━━━━━━┓ │ ┃ entity_id ┃──┼─▶┃ hub_ids ┃ -# ┃ Donor replica ┃ │ ┣━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┫ │ ┗━━━━━━━━━━━━━━━━━━┛ -# ┃ ┃ └──┃ contents.projects.document_id ┃ │ -# ┣━━━━━━━━━━━━━━━━━━┫ ┣━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┫ │ ┏━━━━━━━━━━━━━━━━━━┓ -# ┃ entity_id ┃◀────┃ contents.donors.document_id ┃ │ ┃ Specimen replica ┃ -# ┣━━━━━━━━━━━━━━━━━━┫ ┣━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┫ │ ┃ ┃ -# ┃ hub_ids ┃ ┌──┃ contents.protocols.document_id ┃ │ ┣━━━━━━━━━━━━━━━━━━┫ -# ┗━━━━━━━━━━━━━━━━━━┛ │ ┗━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┛ │ ┃ entity_id ┃ -# │ │ ┣━━━━━━━━━━━━━━━━━━┫ -# ┏━━━━━━━━━━━━━━━━━━┓ │ └─▶┃ hub_ids ┃ -# ┃ Protocol replica ┃ │ ┗━━━━━━━━━━━━━━━━━━┛ -# ┃ ┃ │ -# ┣━━━━━━━━━━━━━━━━━━┫ │ -# ┃ entity_id ┃◀─┘ -# ┣━━━━━━━━━━━━━━━━━━┫ -# ┃ hub_ids ┃ -# ┗━━━━━━━━━━━━━━━━━━┛ - - class DocumentType(Enum): contribution = 'contribution' aggregate = 'aggregate' diff --git 
a/src/azul/plugins/metadata/hca/indexer/transform.py b/src/azul/plugins/metadata/hca/indexer/transform.py index 465f72d4cf..75b145c0ab 100644 --- a/src/azul/plugins/metadata/hca/indexer/transform.py +++ b/src/azul/plugins/metadata/hca/indexer/transform.py @@ -1461,6 +1461,33 @@ class FileTransformer(PartitionedTransformer[api.File], ReplicaTransformer): def entity_type(cls) -> str: return 'files' + # ┏━━━━━━━━━━━━━━━━━━┓ + # ┃ Project replica ┃ + # ┃ ┃ + # ┣━━━━━━━━━━━━━━━━━━┫ ┏━━━━━━━━━━━━━━━━━━┓ + # ┃ entity_id ┃◀─┐ ┃ File replica ┃ + # ┣━━━━━━━━━━━━━━━━━━┫ │ ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓ ┃ ┃ + # ┃ hub_ids ┃ │ ┃ File aggregate ┃ ┣━━━━━━━━━━━━━━━━━━┫ + # ┗━━━━━━━━━━━━━━━━━━┛ │ ┃ ┃ ┌─▶┃ entity_id ┃ + # │ ┣━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┫ │ ┣━━━━━━━━━━━━━━━━━━┫ + # ┏━━━━━━━━━━━━━━━━━━┓ │ ┃ entity_id ┃──┼─▶┃ hub_ids ┃ + # ┃ Donor replica ┃ │ ┣━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┫ │ ┗━━━━━━━━━━━━━━━━━━┛ + # ┃ ┃ └──┃ contents.projects.document_id ┃ │ + # ┣━━━━━━━━━━━━━━━━━━┫ ┣━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┫ │ ┏━━━━━━━━━━━━━━━━━━┓ + # ┃ entity_id ┃◀────┃ contents.donors.document_id ┃ │ ┃ Specimen replica ┃ + # ┣━━━━━━━━━━━━━━━━━━┫ ┣━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┫ │ ┃ ┃ + # ┃ hub_ids ┃ ┌──┃ contents.protocols.document_id ┃ │ ┣━━━━━━━━━━━━━━━━━━┫ + # ┗━━━━━━━━━━━━━━━━━━┛ │ ┗━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┛ │ ┃ entity_id ┃ + # │ │ ┣━━━━━━━━━━━━━━━━━━┫ + # ┏━━━━━━━━━━━━━━━━━━┓ │ └─▶┃ hub_ids ┃ + # ┃ Protocol replica ┃ │ ┗━━━━━━━━━━━━━━━━━━┛ + # ┃ ┃ │ + # ┣━━━━━━━━━━━━━━━━━━┫ │ + # ┃ entity_id ┃◀─┘ + # ┣━━━━━━━━━━━━━━━━━━┫ + # ┃ hub_ids ┃ + # ┗━━━━━━━━━━━━━━━━━━┛ + # @classmethod def hot_entity_types(cls) -> dict[EntityType, EntityType]: return { From b7677c8de0c4ac732b6188fca55076297a83bc9e Mon Sep 17 00:00:00 2001 From: Daniel Sotirhos Date: Tue, 3 Mar 2026 10:39:08 -0800 Subject: [PATCH 57/59] Refactor field exclusions from aggregators --- .../metadata/anvil/indexer/aggregate.py | 56 +++++++++++-------- .../plugins/metadata/hca/indexer/aggregate.py | 22 +++++--- 2 
files changed, 45 insertions(+), 33 deletions(-) diff --git a/src/azul/plugins/metadata/anvil/indexer/aggregate.py b/src/azul/plugins/metadata/anvil/indexer/aggregate.py index bce26ccc25..b968c95c07 100644 --- a/src/azul/plugins/metadata/anvil/indexer/aggregate.py +++ b/src/azul/plugins/metadata/anvil/indexer/aggregate.py @@ -23,14 +23,28 @@ ) +def _never_accumulate(entity_type: str) -> set[str]: + if entity_type == 'activities': + entity_type = 'activity' + elif entity_type == 'diagnoses': + entity_type = 'diagnosis' + else: + assert entity_type.endswith('s') + entity_type = entity_type[:-1] + return { + entity_type + '_id', + 'document_id', + 'source_datarepo_row_ids' + } + + class ActivityAggregator(SimpleAggregator): def _accumulator(self, field: str) -> Accumulator | None: - if field in { - 'activity_id', - 'document_id', - 'source_datarepo_row_ids' - } and self.outer_entity_type != 'files': + if ( + field in _never_accumulate(self.entity_type) + and self.outer_entity_type != 'files' + ): # These fields are only aggregated for files, where they are needed # for compact and PFB manifests return None @@ -41,11 +55,10 @@ def _accumulator(self, field: str) -> Accumulator | None: class BiosampleAggregator(SimpleAggregator): def _accumulator(self, field: str) -> Accumulator | None: - if field in { - 'biosample_id', - 'document_id', - 'source_datarepo_row_ids' - } and self.outer_entity_type != 'files': + if ( + field in _never_accumulate(self.entity_type) + and self.outer_entity_type != 'files' + ): # These fields are only aggregated for files, where they are needed # for compact and PFB manifests return None @@ -79,11 +92,10 @@ def _accumulator(self, field: str) -> Accumulator | None: class DiagnosisAggregator(SimpleAggregator): def _accumulator(self, field: str) -> Accumulator | None: - if field in { - 'diagnosis_id', - 'document_id', - 'source_datarepo_row_ids' - } and self.outer_entity_type != 'files': + if ( + field in _never_accumulate(self.entity_type) + and 
self.outer_entity_type != 'files' + ): # These fields are only aggregated for files, where they are needed # for compact and PFB manifests return None @@ -103,11 +115,10 @@ def _accumulator(self, field: str) -> Accumulator | None: class DonorAggregator(SimpleAggregator): def _accumulator(self, field: str) -> Accumulator | None: - if field in { - 'document_id', - 'donor_id', - 'source_datarepo_row_ids' - } and self.outer_entity_type != 'files': + if ( + field in _never_accumulate(self.entity_type) + and self.outer_entity_type != 'files' + ): # These fields are only aggregated for files, where they are needed # for compact and PFB manifests return None @@ -131,13 +142,10 @@ def _group_keys(self, entity) -> tuple[Any, ...]: return entity['file_format'], def _accumulator(self, field: str) -> Accumulator | None: - if field in { - 'document_id', + if field in _never_accumulate(self.entity_type) | { 'drs_uri', - 'file_id', 'file_md5sum', 'file_name', - 'source_datarepo_row_ids', 'version' }: return None diff --git a/src/azul/plugins/metadata/hca/indexer/aggregate.py b/src/azul/plugins/metadata/hca/indexer/aggregate.py index 775a57f3b5..fc1691cbf5 100644 --- a/src/azul/plugins/metadata/hca/indexer/aggregate.py +++ b/src/azul/plugins/metadata/hca/indexer/aggregate.py @@ -48,6 +48,10 @@ ) +def _never_accumulate() -> set[str]: + return {'biomaterial_id', 'document_id'} + + class HCAAggregate(Aggregate): @cached_property @@ -118,7 +122,7 @@ def _default_accumulator(self) -> Accumulator | None: class SampleAggregator(SimpleAggregator): def _accumulator(self, field) -> Accumulator | None: - if field in ('biomaterial_id', 'document_id'): + if field in _never_accumulate(): # These fields are only aggregated for files, where they are needed # for compact and PFB manifests if self.outer_entity_type == 'files': @@ -132,7 +136,7 @@ def _accumulator(self, field) -> Accumulator | None: class SpecimenAggregator(SimpleAggregator): def _accumulator(self, field) -> Accumulator | None: - if 
field in ('biomaterial_id', 'document_id'): + if field in _never_accumulate(): # These fields are only aggregated for files, where they are needed # for compact and PFB manifests if self.outer_entity_type == 'files': @@ -169,7 +173,7 @@ def _group_keys(self, entity) -> tuple[Any, ...]: return frozenset(entity['organ']), def _accumulator(self, field) -> Accumulator | None: - if field in ('biomaterial_id', 'document_id'): + if field in _never_accumulate(): # These fields are only aggregated for files, where they are needed # for compact and PFB manifests if self.outer_entity_type == 'files': @@ -185,7 +189,7 @@ def _accumulator(self, field) -> Accumulator | None: class CellLineAggregator(SimpleAggregator): def _accumulator(self, field) -> Accumulator | None: - if field == ('biomaterial_id', 'document_id'): + if field in _never_accumulate(): # These fields are only aggregated for files, where they are needed # for compact and PFB manifests if self.outer_entity_type == 'files': @@ -238,7 +242,7 @@ def _accumulator(self, field) -> Accumulator | None: class OrganoidAggregator(SimpleAggregator): def _accumulator(self, field) -> Accumulator | None: - if field in ('biomaterial_id', 'document_id'): + if field in _never_accumulate(): # These fields are only aggregated for files, where they are needed # for compact and PFB manifests if self.outer_entity_type == 'files': @@ -301,7 +305,7 @@ def _default_accumulator(self) -> Accumulator | None: class SequencingInputAggregator(SimpleAggregator): def _accumulator(self, field) -> Accumulator | None: - if field in ('biomaterial_id', 'document_id'): + if field in _never_accumulate(): # These fields are only aggregated for files, where they are needed # for compact and PFB manifests if self.outer_entity_type == 'files': @@ -315,7 +319,7 @@ def _accumulator(self, field) -> Accumulator | None: class SequencingProcessAggregator(SimpleAggregator): def _accumulator(self, field) -> Accumulator | None: - if field == 'document_id': + if field 
in _never_accumulate(): # These fields are only aggregated for files, where they are needed # for compact and PFB manifests if self.outer_entity_type == 'files': @@ -332,7 +336,7 @@ def _default_accumulator(self) -> Accumulator | None: class MatricesAggregator(SimpleAggregator): def _accumulator(self, field) -> Accumulator | None: - if field == 'document_id': + if field in _never_accumulate(): return None elif field == 'file': return DictAccumulator(max_size=int(515 * 1.25), key=itemgetter('uuid')) @@ -343,7 +347,7 @@ def _accumulator(self, field) -> Accumulator | None: class DateAggregator(SimpleAggregator): def _accumulator(self, field) -> Accumulator | None: - if field == 'document_id': + if field in _never_accumulate(): return None elif field in ('submission_date', 'aggregate_submission_date'): return MinAccumulator() From b634592077e8dc191ca1a55c4ef8a19ec76644b4 Mon Sep 17 00:00:00 2001 From: Daniel Sotirhos Date: Tue, 7 Apr 2026 10:00:25 -0700 Subject: [PATCH 58/59] fixup! Refactor field exclusions from aggregators --- .../metadata/anvil/indexer/aggregate.py | 53 ++++++++++--------- .../plugins/metadata/hca/indexer/aggregate.py | 46 ++++++++-------- 2 files changed, 52 insertions(+), 47 deletions(-) diff --git a/src/azul/plugins/metadata/anvil/indexer/aggregate.py b/src/azul/plugins/metadata/anvil/indexer/aggregate.py index b968c95c07..456ea5ff52 100644 --- a/src/azul/plugins/metadata/anvil/indexer/aggregate.py +++ b/src/azul/plugins/metadata/anvil/indexer/aggregate.py @@ -23,26 +23,29 @@ ) -def _never_accumulate(entity_type: str) -> set[str]: - if entity_type == 'activities': - entity_type = 'activity' - elif entity_type == 'diagnoses': - entity_type = 'diagnosis' - else: - assert entity_type.endswith('s') - entity_type = entity_type[:-1] - return { - entity_type + '_id', - 'document_id', - 'source_datarepo_row_ids' - } - - -class ActivityAggregator(SimpleAggregator): +class AnVILEntityAggregator(SimpleAggregator): + + def _never_accumulate(self) -> 
set[str]: + entity_type = self.entity_type + if entity_type == 'activities': + entity_type = 'activity' + elif entity_type == 'diagnoses': + entity_type = 'diagnosis' + else: + assert entity_type.endswith('s') + entity_type = entity_type[:-1] + return { + entity_type + '_id', + 'document_id', + 'source_datarepo_row_ids' + } + + +class ActivityAggregator(AnVILEntityAggregator): def _accumulator(self, field: str) -> Accumulator | None: if ( - field in _never_accumulate(self.entity_type) + field in self._never_accumulate() and self.outer_entity_type != 'files' ): # These fields are only aggregated for files, where they are needed @@ -52,11 +55,11 @@ def _accumulator(self, field: str) -> Accumulator | None: return super()._accumulator(field) -class BiosampleAggregator(SimpleAggregator): +class BiosampleAggregator(AnVILEntityAggregator): def _accumulator(self, field: str) -> Accumulator | None: if ( - field in _never_accumulate(self.entity_type) + field in self._never_accumulate() and self.outer_entity_type != 'files' ): # These fields are only aggregated for files, where they are needed @@ -89,11 +92,11 @@ def _accumulator(self, field: str) -> Accumulator | None: return super()._accumulator(field) -class DiagnosisAggregator(SimpleAggregator): +class DiagnosisAggregator(AnVILEntityAggregator): def _accumulator(self, field: str) -> Accumulator | None: if ( - field in _never_accumulate(self.entity_type) + field in self._never_accumulate() and self.outer_entity_type != 'files' ): # These fields are only aggregated for files, where they are needed @@ -112,11 +115,11 @@ def _accumulator(self, field: str) -> Accumulator | None: return super()._accumulator(field) -class DonorAggregator(SimpleAggregator): +class DonorAggregator(AnVILEntityAggregator): def _accumulator(self, field: str) -> Accumulator | None: if ( - field in _never_accumulate(self.entity_type) + field in self._never_accumulate() and self.outer_entity_type != 'files' ): # These fields are only aggregated for 
files, where they are needed @@ -126,7 +129,7 @@ def _accumulator(self, field: str) -> Accumulator | None: return super()._accumulator(field) -class FileAggregator(GroupingAggregator): +class FileAggregator(AnVILEntityAggregator, GroupingAggregator): def _transform_entity(self, entity: JSON) -> JSON: file_aggregate_fields = { @@ -142,7 +145,7 @@ def _group_keys(self, entity) -> tuple[Any, ...]: return entity['file_format'], def _accumulator(self, field: str) -> Accumulator | None: - if field in _never_accumulate(self.entity_type) | { + if field in self._never_accumulate() | { 'drs_uri', 'file_md5sum', 'file_name', diff --git a/src/azul/plugins/metadata/hca/indexer/aggregate.py b/src/azul/plugins/metadata/hca/indexer/aggregate.py index fc1691cbf5..9b68babba8 100644 --- a/src/azul/plugins/metadata/hca/indexer/aggregate.py +++ b/src/azul/plugins/metadata/hca/indexer/aggregate.py @@ -48,10 +48,6 @@ ) -def _never_accumulate() -> set[str]: - return {'biomaterial_id', 'document_id'} - - class HCAAggregate(Aggregate): @cached_property @@ -86,6 +82,12 @@ def to_json(self) -> JSON: effective_cell_count=self.effective_cell_count) +class HCAEntityAggregator(SimpleAggregator): + + def _never_accumulate(self) -> set[str]: + return {'biomaterial_id', 'document_id'} + + class FileAggregator(GroupingAggregator): def _transform_entity(self, entity: JSON) -> JSON: @@ -119,10 +121,10 @@ def _default_accumulator(self) -> Accumulator | None: return None -class SampleAggregator(SimpleAggregator): +class SampleAggregator(HCAEntityAggregator): def _accumulator(self, field) -> Accumulator | None: - if field in _never_accumulate(): + if field in self._never_accumulate(): # These fields are only aggregated for files, where they are needed # for compact and PFB manifests if self.outer_entity_type == 'files': @@ -133,10 +135,10 @@ def _accumulator(self, field) -> Accumulator | None: return super()._accumulator(field) -class SpecimenAggregator(SimpleAggregator): +class 
SpecimenAggregator(HCAEntityAggregator): def _accumulator(self, field) -> Accumulator | None: - if field in _never_accumulate(): + if field in self._never_accumulate(): # These fields are only aggregated for files, where they are needed # for compact and PFB manifests if self.outer_entity_type == 'files': @@ -153,7 +155,7 @@ def _accumulator(self, field) -> Accumulator | None: return super()._accumulator(field) -class CellSuspensionAggregator(GroupingAggregator): +class CellSuspensionAggregator(HCAEntityAggregator, GroupingAggregator): cell_count_fields = frozenset([ 'total_estimated_cells', 'total_estimated_cells_redundant' @@ -173,7 +175,7 @@ def _group_keys(self, entity) -> tuple[Any, ...]: return frozenset(entity['organ']), def _accumulator(self, field) -> Accumulator | None: - if field in _never_accumulate(): + if field in self._never_accumulate(): # These fields are only aggregated for files, where they are needed # for compact and PFB manifests if self.outer_entity_type == 'files': @@ -186,10 +188,10 @@ def _accumulator(self, field) -> Accumulator | None: return super()._accumulator(field) -class CellLineAggregator(SimpleAggregator): +class CellLineAggregator(HCAEntityAggregator): def _accumulator(self, field) -> Accumulator | None: - if field in _never_accumulate(): + if field in self._never_accumulate(): # These fields are only aggregated for files, where they are needed # for compact and PFB manifests if self.outer_entity_type == 'files': @@ -239,10 +241,10 @@ def _accumulator(self, field) -> Accumulator | None: return super()._accumulator(field) -class OrganoidAggregator(SimpleAggregator): +class OrganoidAggregator(HCAEntityAggregator): def _accumulator(self, field) -> Accumulator | None: - if field in _never_accumulate(): + if field in self._never_accumulate(): # These fields are only aggregated for files, where they are needed # for compact and PFB manifests if self.outer_entity_type == 'files': @@ -302,10 +304,10 @@ def _default_accumulator(self) -> 
Accumulator | None: return SetAccumulator() -class SequencingInputAggregator(SimpleAggregator): +class SequencingInputAggregator(HCAEntityAggregator): def _accumulator(self, field) -> Accumulator | None: - if field in _never_accumulate(): + if field in self._never_accumulate(): # These fields are only aggregated for files, where they are needed # for compact and PFB manifests if self.outer_entity_type == 'files': @@ -316,10 +318,10 @@ def _accumulator(self, field) -> Accumulator | None: return super()._accumulator(field) -class SequencingProcessAggregator(SimpleAggregator): +class SequencingProcessAggregator(HCAEntityAggregator): def _accumulator(self, field) -> Accumulator | None: - if field in _never_accumulate(): + if field in self._never_accumulate(): # These fields are only aggregated for files, where they are needed # for compact and PFB manifests if self.outer_entity_type == 'files': @@ -333,10 +335,10 @@ def _default_accumulator(self) -> Accumulator | None: return SetAccumulator(max_size=10) -class MatricesAggregator(SimpleAggregator): +class MatricesAggregator(HCAEntityAggregator): def _accumulator(self, field) -> Accumulator | None: - if field in _never_accumulate(): + if field in self._never_accumulate(): return None elif field == 'file': return DictAccumulator(max_size=int(515 * 1.25), key=itemgetter('uuid')) @@ -344,10 +346,10 @@ def _accumulator(self, field) -> Accumulator | None: return SetAccumulator() -class DateAggregator(SimpleAggregator): +class DateAggregator(HCAEntityAggregator): def _accumulator(self, field) -> Accumulator | None: - if field in _never_accumulate(): + if field in self._never_accumulate(): return None elif field in ('submission_date', 'aggregate_submission_date'): return MinAccumulator() From b011c59230b891fd8aed38b9eac5f126677d9f6f Mon Sep 17 00:00:00 2001 From: Daniel Sotirhos Date: Wed, 8 Apr 2026 18:17:32 -0700 Subject: [PATCH 59/59] Enforce that hot entity ids are fully accumulated (#6793) --- src/azul/indexer/aggregate.py 
| 26 ++++++++++++++++++- .../metadata/anvil/indexer/transform.py | 4 ++- .../plugins/metadata/hca/indexer/aggregate.py | 6 ----- .../plugins/metadata/hca/indexer/transform.py | 10 ++++++- 4 files changed, 37 insertions(+), 9 deletions(-) diff --git a/src/azul/indexer/aggregate.py b/src/azul/indexer/aggregate.py index 8c6bf8fe52..7d3247388f 100644 --- a/src/azul/indexer/aggregate.py +++ b/src/azul/indexer/aggregate.py @@ -556,9 +556,23 @@ def get(self) -> int: class EntityAggregator(metaclass=ABCMeta): - def __init__(self, outer_entity_type: EntityType, entity_type: EntityType): + def __init__(self, + outer_entity_type: EntityType, + entity_type: EntityType, + strict: bool = False): + """ + :param outer_entity_type: The entity type of the aggregate document. + + :param entity_type: The entity type of the inner entities being + accumulated. + + :param strict: Enforce complete accumulation of `document_id` for the + inner entity type. Required for "hot" entity types, whose + replicas don't track hub IDs. 
+ """ self.outer_entity_type = outer_entity_type self.entity_type = entity_type + self.strict = strict def _transform_entity(self, entity: JSON) -> JSON: return entity @@ -603,6 +617,16 @@ def _accumulate(self, aggregate: Aggregate, entity: JSON) -> None: accumulator.accumulate(value) def _aggregate(self, aggregate: Aggregate) -> JSON: + if self.strict: + accumulator = aggregate.get('document_id') + assert accumulator is not None, R( + 'Hot entity types must always accumulate document_id', + self.entity_type, aggregate.keys() + ) + assert not (isinstance(accumulator, SetAccumulator) and accumulator.allow_overflow), R( + 'allow_overflow is not permitted when accumulating document_id ' + 'in hot entity types', self.entity_type + ) result = {} for k, accumulator in aggregate.items(): if accumulator is not None: diff --git a/src/azul/plugins/metadata/anvil/indexer/transform.py b/src/azul/plugins/metadata/anvil/indexer/transform.py index 35fe77d749..d3fdaf87ec 100644 --- a/src/azul/plugins/metadata/anvil/indexer/transform.py +++ b/src/azul/plugins/metadata/anvil/indexer/transform.py @@ -181,7 +181,9 @@ def aggregator(cls, entity_type) -> EntityAggregator: agg_cls = FileAggregator else: assert False, entity_type - return agg_cls(cls.entity_type(), entity_type) + return agg_cls(cls.entity_type(), entity_type, + strict=(issubclass(cls, ReplicaTransformer) + and entity_type in cls.hot_entity_types().values())) def estimate(self, partition: BundlePartition) -> int: # Orphans are not considered when deciding whether to partition the diff --git a/src/azul/plugins/metadata/hca/indexer/aggregate.py b/src/azul/plugins/metadata/hca/indexer/aggregate.py index 9b68babba8..b94eee8e56 100644 --- a/src/azul/plugins/metadata/hca/indexer/aggregate.py +++ b/src/azul/plugins/metadata/hca/indexer/aggregate.py @@ -233,9 +233,6 @@ def _accumulator(self, field) -> Accumulator | None: # be omitted during the verbatim handover. 
Donors are a "hot" entity # type, and we can't track their hubs in replica documents, so we # rely on the inner entity IDs instead. - # - # FIXME: Enforce that hot entity types are completely aggregated - # https://github.com/DataBiosphere/azul/issues/6793 return SetAccumulator(max_size=int(931 * 1.25)) else: return super()._accumulator(field) @@ -293,9 +290,6 @@ def _accumulator(self, field) -> Accumulator | None: # protocols may be omitted during the verbatim handover. Some # protocols are "hot" entity types, and we can't track their hubs in # replicas, so we rely on the inner entity IDs instead. - # - # FIXME: Enforce that hot entity types are completely aggregated - # https://github.com/DataBiosphere/azul/issues/6793 return SetAccumulator(max_size=100) else: return super()._accumulator(field) diff --git a/src/azul/plugins/metadata/hca/indexer/transform.py b/src/azul/plugins/metadata/hca/indexer/transform.py index 75b145c0ab..e1f3ffcae4 100644 --- a/src/azul/plugins/metadata/hca/indexer/transform.py +++ b/src/azul/plugins/metadata/hca/indexer/transform.py @@ -511,7 +511,9 @@ def aggregator(cls, entity_type: EntityType) -> EntityAggregator | None: agg_cls = DateAggregator else: agg_cls = SimpleAggregator - return agg_cls(cls.entity_type(), entity_type) + return agg_cls(cls.entity_type(), entity_type, + strict=(issubclass(cls, ReplicaTransformer) + and entity_type in cls.hot_entity_types().values())) def _replica_contents(self, entity: EntityReference) -> JSON: if entity == self.api_bundle.ref: @@ -1577,6 +1579,9 @@ def matrix_stratification_values(self, file: api.File) -> JSON: donor.update( { 'biomaterial_id': f'donor_organism_{file_name}', + # Donors are a hot entity type, so they are required + # to have a document_id. 
+ 'document_id': f'donor_organism_{file_name}', } ) contents['donors'].append(donor) @@ -1594,6 +1599,9 @@ def matrix_stratification_values(self, file: api.File) -> JSON: if library is not None: contents['library_preparation_protocols'].append( { + # Library preparation protocols are a hot entity + # type, so they are required to have a document_id. + 'document_id': f'library_preparation_protocol_{file_name}', 'library_construction_approach': json_sorted(library), } )