diff --git a/lambdas/service/app.py b/lambdas/service/app.py index e4728bd61a..a56d4af63d 100644 --- a/lambdas/service/app.py +++ b/lambdas/service/app.py @@ -55,7 +55,7 @@ # changes and reset the minor version to zero. Otherwise, increment only # the minor version for backwards compatible changes. A backwards # compatible change is one that does not require updates to clients. - 'version': '16.1', + 'version': '17.0', 'description': fd(f''' # Overview diff --git a/lambdas/service/openapi.json b/lambdas/service/openapi.json index a0cc93b94c..a48212a36e 100644 --- a/lambdas/service/openapi.json +++ b/lambdas/service/openapi.json @@ -2,7 +2,7 @@ "openapi": "3.0.1", "info": { "title": "azul-service-dev", - "version": "16.1", + "version": "17.0", "description": "\n# Overview\n\nAzul is a REST web service for querying metadata associated with\nboth experimental and analysis data from a data repository. In order\nto deliver response times that make it suitable for interactive use\ncases, the set of metadata properties that it exposes for sorting,\nfiltering, and aggregation is limited. Azul provides a uniform view\nof the metadata over a range of diverse schemas, effectively\nshielding clients from changes in the schemas as they occur over\ntime. It does so, however, at the expense of detail in the set of\nmetadata properties it exposes and in the accuracy with which it\naggregates them.\n\nAzul denormalizes and aggregates metadata into several different\nindices for selected entity types. Metadata entities can be queried\nusing the [Index](#operations-tag-Index) endpoints.\n\nA set of indices forms a catalog. There is a default catalog called\n`dcp2` which will be used unless a\ndifferent catalog name is specified using the `catalog` query\nparameter. Metadata from different catalogs is completely\nindependent: a response obtained by querying one catalog does not\nnecessarily correlate to a response obtained by querying another\none. Two catalogs can contain metadata from the same sources or\ndifferent sources. It is only guaranteed that the body of a\nresponse by any given endpoint adheres to one schema,\nindependently of which catalog was specified in the request.\n\nAzul provides the ability to download data and metadata via the\n[Manifests](#operations-tag-Manifests) endpoints. The\n`curl` format manifests can be used to\ndownload data files. Other formats provide various views of the\nmetadata. Manifests can be generated for a selection of files using\nfilters. These filters are interchangeable with the filters used by\nthe [Index](#operations-tag-Index) endpoints.\n\nAzul also provides a [summary](#operations-Index-get_index_summary)\nview of indexed data.\n\n## Data model\n\nAny index, when queried, returns a JSON array of hits. Each hit\nrepresents a metadata entity. Nested in each hit is a summary of the\nproperties of entities associated with the hit. An entity is\nassociated either by a direct edge in the original metadata graph,\nor indirectly as a series of edges. The nested properties are\ngrouped by the type of the associated entity. The properties of all\ndata files associated with a particular sample, for example, are\nlisted under `hits[*].files` in a `/index/samples` response. It is\nimportant to note that while each _hit_ represents a discrete\nentity, the properties nested within that hit are the result of an\naggregation over potentially many associated entities.\n\nTo illustrate this, consider a data file that is part of two\nprojects (a project is a group of related experiments, typically by\none laboratory, institution or consortium). Querying the `files`\nindex for this file yields a hit looking something like:\n\n```\n{\n \"projects\": [\n {\n \"projectTitle\": \"Project One\"\n \"laboratory\": ...,\n ...\n },\n {\n \"projectTitle\": \"Project Two\"\n \"laboratory\": ...,\n ...\n }\n ],\n \"files\": [\n {\n \"format\": \"pdf\",\n \"name\": \"Team description.pdf\",\n ...\n }\n ]\n}\n```\n\nThis example hit contains two kinds of nested entities (a hit in an\nactual Azul response will contain more): There are the two projects\nentities, and the file itself. These nested entities contain\nselected metadata properties extracted in a consistent way. This\nmakes filtering and sorting simple.\n\nAlso notice that there is only one file. When querying a particular\nindex, the corresponding entity will always be a singleton like\nthis.\n\n\n## Contact us\n\nFor technical support please file an issue at\n[GitHub](https://github.com/DataBiosphere/azul/issues) or email\n`azul-group@ucsc.edu`. To report a security concern or misconduct please email\n`azul-group@ucsc.edu`.\n" }, "tags": [ @@ -2483,23 +2483,6 @@ ], "additionalProperties": false }, - "sampleId": { - "type": "object", - "properties": { - "is": { - "type": "array", - "items": { - "type": "string", - "nullable": true - }, - "minItems": 1 - } - }, - "required": [ - "is" - ], - "additionalProperties": false - }, "selectedCellType": { "type": "object", "properties": { @@ -2778,7 +2761,7 @@ } } }, - "description": "\nCriteria to filter entities from the search results.\n\nEach filter consists of a field name, an operator, and an array of field\nvalues. The available operators are \"is\", \"within\", \"contains\", and\n\"intersects\". Multiple filters are combined using \"and\" logic. For an\nentity to be included in the response, it must match all filters. How\nmultiple field values within a single filter are combined depends on the\noperator.\n\nFor the \"is\" operator, multiple values are combined using \"or\" logic.\nFor example, `{\"fileFormat\": {\"is\": [\"fastq\", \"fastq.gz\"]}}` selects\nentities where the file format is either \"fastq\" or \"fastq.gz\". For the\n\"within\", \"intersects\", and \"contains\" operators, the field values must\ncome in nested pairs specifying upper and lower bounds, and multiple\npairs are combined using \"and\" logic. For example, `{\"donorCount\":\n{\"within\": [[1,5], [5,10]]}}` selects entities whose donor organism\ncount falls within both ranges, i.e., is exactly 5.\n\nThe accessions field supports filtering for a specific accession and/or\nnamespace within a project. For example, `{\"accessions\": {\"is\": [\n{\"namespace\":\"array_express\"}]}}` will filter for projects that have an\n`array_express` accession. Similarly, `{\"accessions\": {\"is\": [\n{\"accession\":\"ERP112843\"}]}}` will filter for projects that have the\naccession `ERP112843` while `{\"accessions\": {\"is\": [\n{\"namespace\":\"array_express\", \"accession\": \"E-AAAA-00\"}]}}` will filter\nfor projects that match both values.\n\nThe organismAge field is special in that it contains two property keys:\nvalue and unit. For example, `{\"organismAge\": {\"is\": [{\"value\": \"20\",\n\"unit\": \"year\"}]}}`. Both keys are required. `{\"organismAge\": {\"is\":\n[null]}}` selects entities that have no organism age.\n\nSupported field names are: accessions, aggregateLastModifiedDate, aggregateSubmissionDate, aggregateUpdateDate, assayType, biologicalSex, bionetworkName, bundleUuid, bundleVersion, cellCount, cellLineType, contactName, contentDescription, dataUseRestriction, developmentStage, donorCount, donorDisease, duosId, effectiveCellCount, effectiveOrgan, entryId, fileFormat, fileId, fileName, fileSize, fileSource, fileVersion, genusSpecies, institution, instrumentManufacturerModel, isIntermediate, isTissueAtlasProject, laboratory, lastModifiedDate, libraryConstructionApproach, matrixCellCount, modelOrgan, modelOrganPart, nucleicAcidSource, organ, organPart, organismAge, organismAgeRange, pairedEnd, preservationMethod, project, projectDescription, projectEstimatedCellCount, projectId, projectTitle, publicationTitle, sampleDisease, sampleEntityType, sampleId, selectedCellType, sha256, sourceId, sourceSpec, specimenDisease, specimenOrgan, specimenOrganPart, submissionDate, tissueAtlas, updateDate, workflow, accessible\n" + "description": "\nCriteria to filter entities from the search results.\n\nEach filter consists of a field name, an operator, and an array of field\nvalues. The available operators are \"is\", \"within\", \"contains\", and\n\"intersects\". Multiple filters are combined using \"and\" logic. For an\nentity to be included in the response, it must match all filters. How\nmultiple field values within a single filter are combined depends on the\noperator.\n\nFor the \"is\" operator, multiple values are combined using \"or\" logic.\nFor example, `{\"fileFormat\": {\"is\": [\"fastq\", \"fastq.gz\"]}}` selects\nentities where the file format is either \"fastq\" or \"fastq.gz\". For the\n\"within\", \"intersects\", and \"contains\" operators, the field values must\ncome in nested pairs specifying upper and lower bounds, and multiple\npairs are combined using \"and\" logic. For example, `{\"donorCount\":\n{\"within\": [[1,5], [5,10]]}}` selects entities whose donor organism\ncount falls within both ranges, i.e., is exactly 5.\n\nThe accessions field supports filtering for a specific accession and/or\nnamespace within a project. For example, `{\"accessions\": {\"is\": [\n{\"namespace\":\"array_express\"}]}}` will filter for projects that have an\n`array_express` accession. Similarly, `{\"accessions\": {\"is\": [\n{\"accession\":\"ERP112843\"}]}}` will filter for projects that have the\naccession `ERP112843` while `{\"accessions\": {\"is\": [\n{\"namespace\":\"array_express\", \"accession\": \"E-AAAA-00\"}]}}` will filter\nfor projects that match both values.\n\nThe organismAge field is special in that it contains two property keys:\nvalue and unit. For example, `{\"organismAge\": {\"is\": [{\"value\": \"20\",\n\"unit\": \"year\"}]}}`. Both keys are required. `{\"organismAge\": {\"is\":\n[null]}}` selects entities that have no organism age.\n\nSupported field names are: accessions, aggregateLastModifiedDate, aggregateSubmissionDate, aggregateUpdateDate, assayType, biologicalSex, bionetworkName, bundleUuid, bundleVersion, cellCount, cellLineType, contactName, contentDescription, dataUseRestriction, developmentStage, donorCount, donorDisease, duosId, effectiveCellCount, effectiveOrgan, entryId, fileFormat, fileId, fileName, fileSize, fileSource, fileVersion, genusSpecies, institution, instrumentManufacturerModel, isIntermediate, isTissueAtlasProject, laboratory, lastModifiedDate, libraryConstructionApproach, matrixCellCount, modelOrgan, modelOrganPart, nucleicAcidSource, organ, organPart, organismAge, organismAgeRange, pairedEnd, preservationMethod, project, projectDescription, projectEstimatedCellCount, projectId, projectTitle, publicationTitle, sampleDisease, sampleEntityType, selectedCellType, sha256, sourceId, sourceSpec, specimenDisease, specimenOrgan, specimenOrganPart, submissionDate, tissueAtlas, updateDate, workflow, accessible\n" }, { "name": "entity_type", @@ -2867,7 +2850,6 @@ "publicationTitle", "sampleDisease", "sampleEntityType", - "sampleId", "selectedCellType", "sha256", "sourceId", @@ -4367,23 +4349,6 @@ ], "additionalProperties": false }, - "sampleId": { - "type": "object", - "properties": { - "is": { - "type": "array", - "items": { - "type": "string", - "nullable": true - }, - "minItems": 1 - } - }, - "required": [ - "is" - ], - "additionalProperties": false - }, "selectedCellType": { "type": "object", "properties": { @@ -4662,7 +4627,7 @@ } } }, - "description": "\nCriteria to filter entities from the search results.\n\nEach filter consists of a field name, an operator, and an array of field\nvalues. The available operators are \"is\", \"within\", \"contains\", and\n\"intersects\". Multiple filters are combined using \"and\" logic. For an\nentity to be included in the response, it must match all filters. How\nmultiple field values within a single filter are combined depends on the\noperator.\n\nFor the \"is\" operator, multiple values are combined using \"or\" logic.\nFor example, `{\"fileFormat\": {\"is\": [\"fastq\", \"fastq.gz\"]}}` selects\nentities where the file format is either \"fastq\" or \"fastq.gz\". For the\n\"within\", \"intersects\", and \"contains\" operators, the field values must\ncome in nested pairs specifying upper and lower bounds, and multiple\npairs are combined using \"and\" logic. For example, `{\"donorCount\":\n{\"within\": [[1,5], [5,10]]}}` selects entities whose donor organism\ncount falls within both ranges, i.e., is exactly 5.\n\nThe accessions field supports filtering for a specific accession and/or\nnamespace within a project. For example, `{\"accessions\": {\"is\": [\n{\"namespace\":\"array_express\"}]}}` will filter for projects that have an\n`array_express` accession. Similarly, `{\"accessions\": {\"is\": [\n{\"accession\":\"ERP112843\"}]}}` will filter for projects that have the\naccession `ERP112843` while `{\"accessions\": {\"is\": [\n{\"namespace\":\"array_express\", \"accession\": \"E-AAAA-00\"}]}}` will filter\nfor projects that match both values.\n\nThe organismAge field is special in that it contains two property keys:\nvalue and unit. For example, `{\"organismAge\": {\"is\": [{\"value\": \"20\",\n\"unit\": \"year\"}]}}`. Both keys are required. `{\"organismAge\": {\"is\":\n[null]}}` selects entities that have no organism age.\n\nSupported field names are: accessions, aggregateLastModifiedDate, aggregateSubmissionDate, aggregateUpdateDate, assayType, biologicalSex, bionetworkName, bundleUuid, bundleVersion, cellCount, cellLineType, contactName, contentDescription, dataUseRestriction, developmentStage, donorCount, donorDisease, duosId, effectiveCellCount, effectiveOrgan, entryId, fileFormat, fileId, fileName, fileSize, fileSource, fileVersion, genusSpecies, institution, instrumentManufacturerModel, isIntermediate, isTissueAtlasProject, laboratory, lastModifiedDate, libraryConstructionApproach, matrixCellCount, modelOrgan, modelOrganPart, nucleicAcidSource, organ, organPart, organismAge, organismAgeRange, pairedEnd, preservationMethod, project, projectDescription, projectEstimatedCellCount, projectId, projectTitle, publicationTitle, sampleDisease, sampleEntityType, sampleId, selectedCellType, sha256, sourceId, sourceSpec, specimenDisease, specimenOrgan, specimenOrganPart, submissionDate, tissueAtlas, updateDate, workflow, accessible\n" + "description": "\nCriteria to filter entities from the search results.\n\nEach filter consists of a field name, an operator, and an array of field\nvalues. The available operators are \"is\", \"within\", \"contains\", and\n\"intersects\". Multiple filters are combined using \"and\" logic. For an\nentity to be included in the response, it must match all filters. How\nmultiple field values within a single filter are combined depends on the\noperator.\n\nFor the \"is\" operator, multiple values are combined using \"or\" logic.\nFor example, `{\"fileFormat\": {\"is\": [\"fastq\", \"fastq.gz\"]}}` selects\nentities where the file format is either \"fastq\" or \"fastq.gz\". For the\n\"within\", \"intersects\", and \"contains\" operators, the field values must\ncome in nested pairs specifying upper and lower bounds, and multiple\npairs are combined using \"and\" logic. For example, `{\"donorCount\":\n{\"within\": [[1,5], [5,10]]}}` selects entities whose donor organism\ncount falls within both ranges, i.e., is exactly 5.\n\nThe accessions field supports filtering for a specific accession and/or\nnamespace within a project. For example, `{\"accessions\": {\"is\": [\n{\"namespace\":\"array_express\"}]}}` will filter for projects that have an\n`array_express` accession. Similarly, `{\"accessions\": {\"is\": [\n{\"accession\":\"ERP112843\"}]}}` will filter for projects that have the\naccession `ERP112843` while `{\"accessions\": {\"is\": [\n{\"namespace\":\"array_express\", \"accession\": \"E-AAAA-00\"}]}}` will filter\nfor projects that match both values.\n\nThe organismAge field is special in that it contains two property keys:\nvalue and unit. For example, `{\"organismAge\": {\"is\": [{\"value\": \"20\",\n\"unit\": \"year\"}]}}`. Both keys are required. `{\"organismAge\": {\"is\":\n[null]}}` selects entities that have no organism age.\n\nSupported field names are: accessions, aggregateLastModifiedDate, aggregateSubmissionDate, aggregateUpdateDate, assayType, biologicalSex, bionetworkName, bundleUuid, bundleVersion, cellCount, cellLineType, contactName, contentDescription, dataUseRestriction, developmentStage, donorCount, donorDisease, duosId, effectiveCellCount, effectiveOrgan, entryId, fileFormat, fileId, fileName, fileSize, fileSource, fileVersion, genusSpecies, institution, instrumentManufacturerModel, isIntermediate, isTissueAtlasProject, laboratory, lastModifiedDate, libraryConstructionApproach, matrixCellCount, modelOrgan, modelOrganPart, nucleicAcidSource, organ, organPart, organismAge, organismAgeRange, pairedEnd, preservationMethod, project, projectDescription, projectEstimatedCellCount, projectId, projectTitle, publicationTitle, sampleDisease, sampleEntityType, selectedCellType, sha256, sourceId, sourceSpec, specimenDisease, specimenOrgan, specimenOrganPart, submissionDate, tissueAtlas, updateDate, workflow, accessible\n" }, { "name": "entity_type", @@ -4751,7 +4716,6 @@ "publicationTitle", "sampleDisease", "sampleEntityType", - "sampleId", "selectedCellType", "sha256", "sourceId", @@ -6251,23 +6215,6 @@ ], "additionalProperties": false }, - "sampleId": { - "type": "object", - "properties": { - "is": { - "type": "array", - "items": { - "type": "string", - "nullable": true - }, - "minItems": 1 - } - }, - "required": [ - "is" - ], - "additionalProperties": false - }, "selectedCellType": { "type": "object", "properties": { @@ -6546,7 +6493,7 @@ } } }, - "description": "\nCriteria to filter entities from the search results.\n\nEach filter consists of a field name, an operator, and an array of field\nvalues. The available operators are \"is\", \"within\", \"contains\", and\n\"intersects\". Multiple filters are combined using \"and\" logic. For an\nentity to be included in the response, it must match all filters. How\nmultiple field values within a single filter are combined depends on the\noperator.\n\nFor the \"is\" operator, multiple values are combined using \"or\" logic.\nFor example, `{\"fileFormat\": {\"is\": [\"fastq\", \"fastq.gz\"]}}` selects\nentities where the file format is either \"fastq\" or \"fastq.gz\". For the\n\"within\", \"intersects\", and \"contains\" operators, the field values must\ncome in nested pairs specifying upper and lower bounds, and multiple\npairs are combined using \"and\" logic. For example, `{\"donorCount\":\n{\"within\": [[1,5], [5,10]]}}` selects entities whose donor organism\ncount falls within both ranges, i.e., is exactly 5.\n\nThe accessions field supports filtering for a specific accession and/or\nnamespace within a project. For example, `{\"accessions\": {\"is\": [\n{\"namespace\":\"array_express\"}]}}` will filter for projects that have an\n`array_express` accession. Similarly, `{\"accessions\": {\"is\": [\n{\"accession\":\"ERP112843\"}]}}` will filter for projects that have the\naccession `ERP112843` while `{\"accessions\": {\"is\": [\n{\"namespace\":\"array_express\", \"accession\": \"E-AAAA-00\"}]}}` will filter\nfor projects that match both values.\n\nThe organismAge field is special in that it contains two property keys:\nvalue and unit. For example, `{\"organismAge\": {\"is\": [{\"value\": \"20\",\n\"unit\": \"year\"}]}}`. Both keys are required. `{\"organismAge\": {\"is\":\n[null]}}` selects entities that have no organism age.\n\nSupported field names are: accessions, aggregateLastModifiedDate, aggregateSubmissionDate, aggregateUpdateDate, assayType, biologicalSex, bionetworkName, bundleUuid, bundleVersion, cellCount, cellLineType, contactName, contentDescription, dataUseRestriction, developmentStage, donorCount, donorDisease, duosId, effectiveCellCount, effectiveOrgan, entryId, fileFormat, fileId, fileName, fileSize, fileSource, fileVersion, genusSpecies, institution, instrumentManufacturerModel, isIntermediate, isTissueAtlasProject, laboratory, lastModifiedDate, libraryConstructionApproach, matrixCellCount, modelOrgan, modelOrganPart, nucleicAcidSource, organ, organPart, organismAge, organismAgeRange, pairedEnd, preservationMethod, project, projectDescription, projectEstimatedCellCount, projectId, projectTitle, publicationTitle, sampleDisease, sampleEntityType, sampleId, selectedCellType, sha256, sourceId, sourceSpec, specimenDisease, specimenOrgan, specimenOrganPart, submissionDate, tissueAtlas, updateDate, workflow, accessible\n" + "description": "\nCriteria to filter entities from the search results.\n\nEach filter consists of a field name, an operator, and an array of field\nvalues. The available operators are \"is\", \"within\", \"contains\", and\n\"intersects\". Multiple filters are combined using \"and\" logic. For an\nentity to be included in the response, it must match all filters. How\nmultiple field values within a single filter are combined depends on the\noperator.\n\nFor the \"is\" operator, multiple values are combined using \"or\" logic.\nFor example, `{\"fileFormat\": {\"is\": [\"fastq\", \"fastq.gz\"]}}` selects\nentities where the file format is either \"fastq\" or \"fastq.gz\". For the\n\"within\", \"intersects\", and \"contains\" operators, the field values must\ncome in nested pairs specifying upper and lower bounds, and multiple\npairs are combined using \"and\" logic. For example, `{\"donorCount\":\n{\"within\": [[1,5], [5,10]]}}` selects entities whose donor organism\ncount falls within both ranges, i.e., is exactly 5.\n\nThe accessions field supports filtering for a specific accession and/or\nnamespace within a project. For example, `{\"accessions\": {\"is\": [\n{\"namespace\":\"array_express\"}]}}` will filter for projects that have an\n`array_express` accession. Similarly, `{\"accessions\": {\"is\": [\n{\"accession\":\"ERP112843\"}]}}` will filter for projects that have the\naccession `ERP112843` while `{\"accessions\": {\"is\": [\n{\"namespace\":\"array_express\", \"accession\": \"E-AAAA-00\"}]}}` will filter\nfor projects that match both values.\n\nThe organismAge field is special in that it contains two property keys:\nvalue and unit. For example, `{\"organismAge\": {\"is\": [{\"value\": \"20\",\n\"unit\": \"year\"}]}}`. Both keys are required. `{\"organismAge\": {\"is\":\n[null]}}` selects entities that have no organism age.\n\nSupported field names are: accessions, aggregateLastModifiedDate, aggregateSubmissionDate, aggregateUpdateDate, assayType, biologicalSex, bionetworkName, bundleUuid, bundleVersion, cellCount, cellLineType, contactName, contentDescription, dataUseRestriction, developmentStage, donorCount, donorDisease, duosId, effectiveCellCount, effectiveOrgan, entryId, fileFormat, fileId, fileName, fileSize, fileSource, fileVersion, genusSpecies, institution, instrumentManufacturerModel, isIntermediate, isTissueAtlasProject, laboratory, lastModifiedDate, libraryConstructionApproach, matrixCellCount, modelOrgan, modelOrganPart, nucleicAcidSource, organ, organPart, organismAge, organismAgeRange, pairedEnd, preservationMethod, project, projectDescription, projectEstimatedCellCount, projectId, projectTitle, publicationTitle, sampleDisease, sampleEntityType, selectedCellType, sha256, sourceId, sourceSpec, specimenDisease, specimenOrgan, specimenOrganPart, submissionDate, tissueAtlas, updateDate, workflow, accessible\n" }, { "name": "entity_type", @@ -6635,7 +6582,6 @@ "publicationTitle", "sampleDisease", "sampleEntityType", - "sampleId", "selectedCellType", "sha256", "sourceId", @@ -8025,23 +7971,6 @@ ], "additionalProperties": false }, - "sampleId": { - "type": "object", - "properties": { - "is": { - "type": "array", - "items": { - "type": "string", - "nullable": true - }, - "minItems": 1 - } - }, - "required": [ - "is" - ], - "additionalProperties": false - }, "selectedCellType": { "type": "object", "properties": { @@ -8320,7 +8249,7 @@ } } }, - "description": "\nCriteria to filter entities from the search results.\n\nEach filter consists of a field name, an operator, and an array of field\nvalues. The available operators are \"is\", \"within\", \"contains\", and\n\"intersects\". Multiple filters are combined using \"and\" logic. For an\nentity to be included in the response, it must match all filters. How\nmultiple field values within a single filter are combined depends on the\noperator.\n\nFor the \"is\" operator, multiple values are combined using \"or\" logic.\nFor example, `{\"fileFormat\": {\"is\": [\"fastq\", \"fastq.gz\"]}}` selects\nentities where the file format is either \"fastq\" or \"fastq.gz\". For the\n\"within\", \"intersects\", and \"contains\" operators, the field values must\ncome in nested pairs specifying upper and lower bounds, and multiple\npairs are combined using \"and\" logic. For example, `{\"donorCount\":\n{\"within\": [[1,5], [5,10]]}}` selects entities whose donor organism\ncount falls within both ranges, i.e., is exactly 5.\n\nThe accessions field supports filtering for a specific accession and/or\nnamespace within a project. For example, `{\"accessions\": {\"is\": [\n{\"namespace\":\"array_express\"}]}}` will filter for projects that have an\n`array_express` accession. Similarly, `{\"accessions\": {\"is\": [\n{\"accession\":\"ERP112843\"}]}}` will filter for projects that have the\naccession `ERP112843` while `{\"accessions\": {\"is\": [\n{\"namespace\":\"array_express\", \"accession\": \"E-AAAA-00\"}]}}` will filter\nfor projects that match both values.\n\nThe organismAge field is special in that it contains two property keys:\nvalue and unit. For example, `{\"organismAge\": {\"is\": [{\"value\": \"20\",\n\"unit\": \"year\"}]}}`. Both keys are required. `{\"organismAge\": {\"is\":\n[null]}}` selects entities that have no organism age.\n\nSupported field names are: accessions, aggregateLastModifiedDate, aggregateSubmissionDate, aggregateUpdateDate, assayType, biologicalSex, bionetworkName, bundleUuid, bundleVersion, cellCount, cellLineType, contactName, contentDescription, dataUseRestriction, developmentStage, donorCount, donorDisease, duosId, effectiveCellCount, effectiveOrgan, entryId, fileFormat, fileId, fileName, fileSize, fileSource, fileVersion, genusSpecies, institution, instrumentManufacturerModel, isIntermediate, isTissueAtlasProject, laboratory, lastModifiedDate, libraryConstructionApproach, matrixCellCount, modelOrgan, modelOrganPart, nucleicAcidSource, organ, organPart, organismAge, organismAgeRange, pairedEnd, preservationMethod, project, projectDescription, projectEstimatedCellCount, projectId, projectTitle, publicationTitle, sampleDisease, sampleEntityType, sampleId, selectedCellType, sha256, sourceId, sourceSpec, specimenDisease, specimenOrgan, specimenOrganPart, submissionDate, tissueAtlas, updateDate, workflow, accessible\n" + "description": "\nCriteria to filter entities from the search results.\n\nEach filter consists of a field name, an operator, and an array of field\nvalues. The available operators are \"is\", \"within\", \"contains\", and\n\"intersects\". Multiple filters are combined using \"and\" logic. For an\nentity to be included in the response, it must match all filters. How\nmultiple field values within a single filter are combined depends on the\noperator.\n\nFor the \"is\" operator, multiple values are combined using \"or\" logic.\nFor example, `{\"fileFormat\": {\"is\": [\"fastq\", \"fastq.gz\"]}}` selects\nentities where the file format is either \"fastq\" or \"fastq.gz\". For the\n\"within\", \"intersects\", and \"contains\" operators, the field values must\ncome in nested pairs specifying upper and lower bounds, and multiple\npairs are combined using \"and\" logic. For example, `{\"donorCount\":\n{\"within\": [[1,5], [5,10]]}}` selects entities whose donor organism\ncount falls within both ranges, i.e., is exactly 5.\n\nThe accessions field supports filtering for a specific accession and/or\nnamespace within a project. For example, `{\"accessions\": {\"is\": [\n{\"namespace\":\"array_express\"}]}}` will filter for projects that have an\n`array_express` accession. Similarly, `{\"accessions\": {\"is\": [\n{\"accession\":\"ERP112843\"}]}}` will filter for projects that have the\naccession `ERP112843` while `{\"accessions\": {\"is\": [\n{\"namespace\":\"array_express\", \"accession\": \"E-AAAA-00\"}]}}` will filter\nfor projects that match both values.\n\nThe organismAge field is special in that it contains two property keys:\nvalue and unit. For example, `{\"organismAge\": {\"is\": [{\"value\": \"20\",\n\"unit\": \"year\"}]}}`. Both keys are required. `{\"organismAge\": {\"is\":\n[null]}}` selects entities that have no organism age.\n\nSupported field names are: accessions, aggregateLastModifiedDate, aggregateSubmissionDate, aggregateUpdateDate, assayType, biologicalSex, bionetworkName, bundleUuid, bundleVersion, cellCount, cellLineType, contactName, contentDescription, dataUseRestriction, developmentStage, donorCount, donorDisease, duosId, effectiveCellCount, effectiveOrgan, entryId, fileFormat, fileId, fileName, fileSize, fileSource, fileVersion, genusSpecies, institution, instrumentManufacturerModel, isIntermediate, isTissueAtlasProject, laboratory, lastModifiedDate, libraryConstructionApproach, matrixCellCount, modelOrgan, modelOrganPart, nucleicAcidSource, organ, organPart, organismAge, organismAgeRange, pairedEnd, preservationMethod, project, projectDescription, projectEstimatedCellCount, projectId, projectTitle, publicationTitle, sampleDisease, sampleEntityType, selectedCellType, sha256, sourceId, sourceSpec, specimenDisease, specimenOrgan, specimenOrganPart, submissionDate, tissueAtlas, updateDate, workflow, accessible\n" } ] }, @@ -9713,23 +9642,6 @@ ], "additionalProperties": false }, - "sampleId": { - "type": "object", - "properties": { - "is": { - "type": "array", - "items": { - "type": "string", - "nullable": true - }, - "minItems": 1 - } - }, - "required": [ - "is" - ], - "additionalProperties": false - }, "selectedCellType": { "type": "object", "properties": { @@ -10008,7 +9920,7 @@ } } }, - "description": "\nCriteria to filter entities from the search results.\n\nEach filter consists of a field name, an operator, and an array of field\nvalues. The available operators are \"is\", \"within\", \"contains\", and\n\"intersects\". Multiple filters are combined using \"and\" logic. For an\nentity to be included in the response, it must match all filters. How\nmultiple field values within a single filter are combined depends on the\noperator.\n\nFor the \"is\" operator, multiple values are combined using \"or\" logic.\nFor example, `{\"fileFormat\": {\"is\": [\"fastq\", \"fastq.gz\"]}}` selects\nentities where the file format is either \"fastq\" or \"fastq.gz\". For the\n\"within\", \"intersects\", and \"contains\" operators, the field values must\ncome in nested pairs specifying upper and lower bounds, and multiple\npairs are combined using \"and\" logic. For example, `{\"donorCount\":\n{\"within\": [[1,5], [5,10]]}}` selects entities whose donor organism\ncount falls within both ranges, i.e., is exactly 5.\n\nThe accessions field supports filtering for a specific accession and/or\nnamespace within a project. For example, `{\"accessions\": {\"is\": [\n{\"namespace\":\"array_express\"}]}}` will filter for projects that have an\n`array_express` accession. Similarly, `{\"accessions\": {\"is\": [\n{\"accession\":\"ERP112843\"}]}}` will filter for projects that have the\naccession `ERP112843` while `{\"accessions\": {\"is\": [\n{\"namespace\":\"array_express\", \"accession\": \"E-AAAA-00\"}]}}` will filter\nfor projects that match both values.\n\nThe organismAge field is special in that it contains two property keys:\nvalue and unit. For example, `{\"organismAge\": {\"is\": [{\"value\": \"20\",\n\"unit\": \"year\"}]}}`. Both keys are required. `{\"organismAge\": {\"is\":\n[null]}}` selects entities that have no organism age.\n\nSupported field names are: accessions, aggregateLastModifiedDate, aggregateSubmissionDate, aggregateUpdateDate, assayType, biologicalSex, bionetworkName, bundleUuid, bundleVersion, cellCount, cellLineType, contactName, contentDescription, dataUseRestriction, developmentStage, donorCount, donorDisease, duosId, effectiveCellCount, effectiveOrgan, entryId, fileFormat, fileId, fileName, fileSize, fileSource, fileVersion, genusSpecies, institution, instrumentManufacturerModel, isIntermediate, isTissueAtlasProject, laboratory, lastModifiedDate, libraryConstructionApproach, matrixCellCount, modelOrgan, modelOrganPart, nucleicAcidSource, organ, organPart, organismAge, organismAgeRange, pairedEnd, preservationMethod, project, projectDescription, projectEstimatedCellCount, projectId, projectTitle, publicationTitle, sampleDisease, sampleEntityType, sampleId, selectedCellType, sha256, sourceId, sourceSpec, specimenDisease, specimenOrgan, specimenOrganPart, submissionDate, tissueAtlas, updateDate, workflow, accessible\n" + "description": "\nCriteria to filter entities from the search results.\n\nEach filter consists of a field name, an operator, and an array of field\nvalues. The available operators are \"is\", \"within\", \"contains\", and\n\"intersects\". Multiple filters are combined using \"and\" logic. For an\nentity to be included in the response, it must match all filters. How\nmultiple field values within a single filter are combined depends on the\noperator.\n\nFor the \"is\" operator, multiple values are combined using \"or\" logic.\nFor example, `{\"fileFormat\": {\"is\": [\"fastq\", \"fastq.gz\"]}}` selects\nentities where the file format is either \"fastq\" or \"fastq.gz\". For the\n\"within\", \"intersects\", and \"contains\" operators, the field values must\ncome in nested pairs specifying upper and lower bounds, and multiple\npairs are combined using \"and\" logic. For example, `{\"donorCount\":\n{\"within\": [[1,5], [5,10]]}}` selects entities whose donor organism\ncount falls within both ranges, i.e., is exactly 5.\n\nThe accessions field supports filtering for a specific accession and/or\nnamespace within a project. For example, `{\"accessions\": {\"is\": [\n{\"namespace\":\"array_express\"}]}}` will filter for projects that have an\n`array_express` accession. Similarly, `{\"accessions\": {\"is\": [\n{\"accession\":\"ERP112843\"}]}}` will filter for projects that have the\naccession `ERP112843` while `{\"accessions\": {\"is\": [\n{\"namespace\":\"array_express\", \"accession\": \"E-AAAA-00\"}]}}` will filter\nfor projects that match both values.\n\nThe organismAge field is special in that it contains two property keys:\nvalue and unit. For example, `{\"organismAge\": {\"is\": [{\"value\": \"20\",\n\"unit\": \"year\"}]}}`. Both keys are required. `{\"organismAge\": {\"is\":\n[null]}}` selects entities that have no organism age.\n\nSupported field names are: accessions, aggregateLastModifiedDate, aggregateSubmissionDate, aggregateUpdateDate, assayType, biologicalSex, bionetworkName, bundleUuid, bundleVersion, cellCount, cellLineType, contactName, contentDescription, dataUseRestriction, developmentStage, donorCount, donorDisease, duosId, effectiveCellCount, effectiveOrgan, entryId, fileFormat, fileId, fileName, fileSize, fileSource, fileVersion, genusSpecies, institution, instrumentManufacturerModel, isIntermediate, isTissueAtlasProject, laboratory, lastModifiedDate, libraryConstructionApproach, matrixCellCount, modelOrgan, modelOrganPart, nucleicAcidSource, organ, organPart, organismAge, organismAgeRange, pairedEnd, preservationMethod, project, projectDescription, projectEstimatedCellCount, projectId, projectTitle, publicationTitle, sampleDisease, sampleEntityType, selectedCellType, sha256, sourceId, sourceSpec, specimenDisease, specimenOrgan, specimenOrganPart, submissionDate, tissueAtlas, updateDate, workflow, accessible\n" } ] } @@ -11359,23 +11271,6 @@ ], "additionalProperties": false }, - "sampleId": { - "type": "object", - "properties": { - "is": { - "type": "array", - "items": { - "type": "string", - "nullable": true - }, - "minItems": 1 - } - }, - "required": [ - "is" - ], - "additionalProperties": false - }, "selectedCellType": { "type": "object", "properties": { @@ -11654,7 +11549,7 @@ } } }, - "description": "\nCriteria to filter entities from the search results.\n\nEach filter consists of a field name, an operator, and an array of field\nvalues. The available operators are \"is\", \"within\", \"contains\", and\n\"intersects\". Multiple filters are combined using \"and\" logic. For an\nentity to be included in the response, it must match all filters. How\nmultiple field values within a single filter are combined depends on the\noperator.\n\nFor the \"is\" operator, multiple values are combined using \"or\" logic.\nFor example, `{\"fileFormat\": {\"is\": [\"fastq\", \"fastq.gz\"]}}` selects\nentities where the file format is either \"fastq\" or \"fastq.gz\". For the\n\"within\", \"intersects\", and \"contains\" operators, the field values must\ncome in nested pairs specifying upper and lower bounds, and multiple\npairs are combined using \"and\" logic. For example, `{\"donorCount\":\n{\"within\": [[1,5], [5,10]]}}` selects entities whose donor organism\ncount falls within both ranges, i.e., is exactly 5.\n\nThe accessions field supports filtering for a specific accession and/or\nnamespace within a project. For example, `{\"accessions\": {\"is\": [\n{\"namespace\":\"array_express\"}]}}` will filter for projects that have an\n`array_express` accession. Similarly, `{\"accessions\": {\"is\": [\n{\"accession\":\"ERP112843\"}]}}` will filter for projects that have the\naccession `ERP112843` while `{\"accessions\": {\"is\": [\n{\"namespace\":\"array_express\", \"accession\": \"E-AAAA-00\"}]}}` will filter\nfor projects that match both values.\n\nThe organismAge field is special in that it contains two property keys:\nvalue and unit. For example, `{\"organismAge\": {\"is\": [{\"value\": \"20\",\n\"unit\": \"year\"}]}}`. Both keys are required. `{\"organismAge\": {\"is\":\n[null]}}` selects entities that have no organism age.\n\nSupported field names are: accessions, aggregateLastModifiedDate, aggregateSubmissionDate, aggregateUpdateDate, assayType, biologicalSex, bionetworkName, bundleUuid, bundleVersion, cellCount, cellLineType, contactName, contentDescription, dataUseRestriction, developmentStage, donorCount, donorDisease, duosId, effectiveCellCount, effectiveOrgan, entryId, fileFormat, fileId, fileName, fileSize, fileSource, fileVersion, genusSpecies, institution, instrumentManufacturerModel, isIntermediate, isTissueAtlasProject, laboratory, lastModifiedDate, libraryConstructionApproach, matrixCellCount, modelOrgan, modelOrganPart, nucleicAcidSource, organ, organPart, organismAge, organismAgeRange, pairedEnd, preservationMethod, project, projectDescription, projectEstimatedCellCount, projectId, projectTitle, publicationTitle, sampleDisease, sampleEntityType, sampleId, selectedCellType, sha256, sourceId, sourceSpec, specimenDisease, specimenOrgan, specimenOrganPart, submissionDate, tissueAtlas, updateDate, workflow, accessible\n" + "description": "\nCriteria to filter entities from the search results.\n\nEach filter consists of a field name, an operator, and an array of field\nvalues. The available operators are \"is\", \"within\", \"contains\", and\n\"intersects\". Multiple filters are combined using \"and\" logic. For an\nentity to be included in the response, it must match all filters. How\nmultiple field values within a single filter are combined depends on the\noperator.\n\nFor the \"is\" operator, multiple values are combined using \"or\" logic.\nFor example, `{\"fileFormat\": {\"is\": [\"fastq\", \"fastq.gz\"]}}` selects\nentities where the file format is either \"fastq\" or \"fastq.gz\". For the\n\"within\", \"intersects\", and \"contains\" operators, the field values must\ncome in nested pairs specifying upper and lower bounds, and multiple\npairs are combined using \"and\" logic. For example, `{\"donorCount\":\n{\"within\": [[1,5], [5,10]]}}` selects entities whose donor organism\ncount falls within both ranges, i.e., is exactly 5.\n\nThe accessions field supports filtering for a specific accession and/or\nnamespace within a project. For example, `{\"accessions\": {\"is\": [\n{\"namespace\":\"array_express\"}]}}` will filter for projects that have an\n`array_express` accession. Similarly, `{\"accessions\": {\"is\": [\n{\"accession\":\"ERP112843\"}]}}` will filter for projects that have the\naccession `ERP112843` while `{\"accessions\": {\"is\": [\n{\"namespace\":\"array_express\", \"accession\": \"E-AAAA-00\"}]}}` will filter\nfor projects that match both values.\n\nThe organismAge field is special in that it contains two property keys:\nvalue and unit. For example, `{\"organismAge\": {\"is\": [{\"value\": \"20\",\n\"unit\": \"year\"}]}}`. Both keys are required. `{\"organismAge\": {\"is\":\n[null]}}` selects entities that have no organism age.\n\nSupported field names are: accessions, aggregateLastModifiedDate, aggregateSubmissionDate, aggregateUpdateDate, assayType, biologicalSex, bionetworkName, bundleUuid, bundleVersion, cellCount, cellLineType, contactName, contentDescription, dataUseRestriction, developmentStage, donorCount, donorDisease, duosId, effectiveCellCount, effectiveOrgan, entryId, fileFormat, fileId, fileName, fileSize, fileSource, fileVersion, genusSpecies, institution, instrumentManufacturerModel, isIntermediate, isTissueAtlasProject, laboratory, lastModifiedDate, libraryConstructionApproach, matrixCellCount, modelOrgan, modelOrganPart, nucleicAcidSource, organ, organPart, organismAge, organismAgeRange, pairedEnd, preservationMethod, project, projectDescription, projectEstimatedCellCount, projectId, projectTitle, publicationTitle, sampleDisease, sampleEntityType, selectedCellType, sha256, sourceId, sourceSpec, specimenDisease, specimenOrgan, specimenOrganPart, submissionDate, tissueAtlas, updateDate, workflow, accessible\n" }, { "name": "format", @@ -13110,23 +13005,6 @@ ], "additionalProperties": false }, - "sampleId": { - "type": "object", - "properties": { - "is": { - "type": "array", - "items": { - "type": "string", - "nullable": true - }, - "minItems": 1 - } - }, - "required": [ - "is" - ], - "additionalProperties": false - }, "selectedCellType": { "type": "object", "properties": { @@ -13405,7 +13283,7 @@ } } }, - "description": "\nCriteria to filter entities from the search results.\n\nEach filter consists of a field name, an operator, and an array of field\nvalues. The available operators are \"is\", \"within\", \"contains\", and\n\"intersects\". Multiple filters are combined using \"and\" logic. For an\nentity to be included in the response, it must match all filters. How\nmultiple field values within a single filter are combined depends on the\noperator.\n\nFor the \"is\" operator, multiple values are combined using \"or\" logic.\nFor example, `{\"fileFormat\": {\"is\": [\"fastq\", \"fastq.gz\"]}}` selects\nentities where the file format is either \"fastq\" or \"fastq.gz\". For the\n\"within\", \"intersects\", and \"contains\" operators, the field values must\ncome in nested pairs specifying upper and lower bounds, and multiple\npairs are combined using \"and\" logic. For example, `{\"donorCount\":\n{\"within\": [[1,5], [5,10]]}}` selects entities whose donor organism\ncount falls within both ranges, i.e., is exactly 5.\n\nThe accessions field supports filtering for a specific accession and/or\nnamespace within a project. For example, `{\"accessions\": {\"is\": [\n{\"namespace\":\"array_express\"}]}}` will filter for projects that have an\n`array_express` accession. Similarly, `{\"accessions\": {\"is\": [\n{\"accession\":\"ERP112843\"}]}}` will filter for projects that have the\naccession `ERP112843` while `{\"accessions\": {\"is\": [\n{\"namespace\":\"array_express\", \"accession\": \"E-AAAA-00\"}]}}` will filter\nfor projects that match both values.\n\nThe organismAge field is special in that it contains two property keys:\nvalue and unit. For example, `{\"organismAge\": {\"is\": [{\"value\": \"20\",\n\"unit\": \"year\"}]}}`. Both keys are required. `{\"organismAge\": {\"is\":\n[null]}}` selects entities that have no organism age.\n\nSupported field names are: accessions, aggregateLastModifiedDate, aggregateSubmissionDate, aggregateUpdateDate, assayType, biologicalSex, bionetworkName, bundleUuid, bundleVersion, cellCount, cellLineType, contactName, contentDescription, dataUseRestriction, developmentStage, donorCount, donorDisease, duosId, effectiveCellCount, effectiveOrgan, entryId, fileFormat, fileId, fileName, fileSize, fileSource, fileVersion, genusSpecies, institution, instrumentManufacturerModel, isIntermediate, isTissueAtlasProject, laboratory, lastModifiedDate, libraryConstructionApproach, matrixCellCount, modelOrgan, modelOrganPart, nucleicAcidSource, organ, organPart, organismAge, organismAgeRange, pairedEnd, preservationMethod, project, projectDescription, projectEstimatedCellCount, projectId, projectTitle, publicationTitle, sampleDisease, sampleEntityType, sampleId, selectedCellType, sha256, sourceId, sourceSpec, specimenDisease, specimenOrgan, specimenOrganPart, submissionDate, tissueAtlas, updateDate, workflow, accessible\n" + "description": "\nCriteria to filter entities from the search results.\n\nEach filter consists of a field name, an operator, and an array of field\nvalues. The available operators are \"is\", \"within\", \"contains\", and\n\"intersects\". Multiple filters are combined using \"and\" logic. For an\nentity to be included in the response, it must match all filters. How\nmultiple field values within a single filter are combined depends on the\noperator.\n\nFor the \"is\" operator, multiple values are combined using \"or\" logic.\nFor example, `{\"fileFormat\": {\"is\": [\"fastq\", \"fastq.gz\"]}}` selects\nentities where the file format is either \"fastq\" or \"fastq.gz\". For the\n\"within\", \"intersects\", and \"contains\" operators, the field values must\ncome in nested pairs specifying upper and lower bounds, and multiple\npairs are combined using \"and\" logic. For example, `{\"donorCount\":\n{\"within\": [[1,5], [5,10]]}}` selects entities whose donor organism\ncount falls within both ranges, i.e., is exactly 5.\n\nThe accessions field supports filtering for a specific accession and/or\nnamespace within a project. For example, `{\"accessions\": {\"is\": [\n{\"namespace\":\"array_express\"}]}}` will filter for projects that have an\n`array_express` accession. Similarly, `{\"accessions\": {\"is\": [\n{\"accession\":\"ERP112843\"}]}}` will filter for projects that have the\naccession `ERP112843` while `{\"accessions\": {\"is\": [\n{\"namespace\":\"array_express\", \"accession\": \"E-AAAA-00\"}]}}` will filter\nfor projects that match both values.\n\nThe organismAge field is special in that it contains two property keys:\nvalue and unit. For example, `{\"organismAge\": {\"is\": [{\"value\": \"20\",\n\"unit\": \"year\"}]}}`. Both keys are required. `{\"organismAge\": {\"is\":\n[null]}}` selects entities that have no organism age.\n\nSupported field names are: accessions, aggregateLastModifiedDate, aggregateSubmissionDate, aggregateUpdateDate, assayType, biologicalSex, bionetworkName, bundleUuid, bundleVersion, cellCount, cellLineType, contactName, contentDescription, dataUseRestriction, developmentStage, donorCount, donorDisease, duosId, effectiveCellCount, effectiveOrgan, entryId, fileFormat, fileId, fileName, fileSize, fileSource, fileVersion, genusSpecies, institution, instrumentManufacturerModel, isIntermediate, isTissueAtlasProject, laboratory, lastModifiedDate, libraryConstructionApproach, matrixCellCount, modelOrgan, modelOrganPart, nucleicAcidSource, organ, organPart, organismAge, organismAgeRange, pairedEnd, preservationMethod, project, projectDescription, projectEstimatedCellCount, projectId, projectTitle, publicationTitle, sampleDisease, sampleEntityType, selectedCellType, sha256, sourceId, sourceSpec, specimenDisease, specimenOrgan, specimenOrganPart, submissionDate, tissueAtlas, updateDate, workflow, accessible\n" }, { "name": "format", diff --git a/src/azul/indexer/aggregate.py b/src/azul/indexer/aggregate.py index 6dcb733de4..7d3247388f 100644 --- a/src/azul/indexer/aggregate.py +++ b/src/azul/indexer/aggregate.py @@ -132,7 +132,9 @@ class SetAccumulator[V: Hashable](Accumulator[V, list[V]]): def __init__(self, max_size: int | None = None, - key: Callable[[V], SupportsRichComparison] | None = None + key: Callable[[V], SupportsRichComparison] | None = None, + *, + allow_overflow: bool = False ) -> None: """ :param max_size: the maximum number of elements to retain @@ -147,6 +149,7 @@ def __init__(self, self.value: set[V] = set() self.max_size = max_size self.key = none_safe_key(none_last=True) if key is None else key + self.allow_overflow = allow_overflow def accumulate(self, value: V | list[V]) -> int: """ @@ -553,9 +556,23 @@ def get(self) -> int: class EntityAggregator(metaclass=ABCMeta): - def __init__(self, outer_entity_type: EntityType, entity_type: EntityType): + def __init__(self, + outer_entity_type: EntityType, + entity_type: EntityType, + strict: bool = False): + """ + :param outer_entity_type: The entity type of the aggregate document. + + :param entity_type: The entity type of the inner entities being + accumulated. + + :param strict: Enforce complete accumulation of `document_id` for the + inner entity type. Required for "hot" entity types, whose + replicas don't track hub IDs. + """ self.outer_entity_type = outer_entity_type self.entity_type = entity_type + self.strict = strict def _transform_entity(self, entity: JSON) -> JSON: return entity @@ -600,13 +617,29 @@ def _accumulate(self, aggregate: Aggregate, entity: JSON) -> None: accumulator.accumulate(value) def _aggregate(self, aggregate: Aggregate) -> JSON: + if self.strict: + accumulator = aggregate.get('document_id') + assert accumulator is not None, R( + 'Hot entity types must always accumulate document_id', + self.entity_type, aggregate.keys() + ) + assert not (isinstance(accumulator, SetAccumulator) and accumulator.allow_overflow), R( + 'allow_overflow is not permitted when accumulating document_id ' + 'in hot entity types', self.entity_type + ) result = {} for k, accumulator in aggregate.items(): if accumulator is not None: result[k] = accumulator.get() if accumulator.dropped > 0: - log.warning('Values were dropped %d times while aggregating %s.%s into %s', - accumulator.dropped, self.entity_type, k, self.outer_entity_type) + message = ( + f'Values were dropped {accumulator.dropped} times while aggregating ' + f'{self.entity_type}.{k} into {self.outer_entity_type}' + ) + if isinstance(accumulator, SetAccumulator) and accumulator.allow_overflow: + log.warning(message) + else: + assert False, R(message) return result diff --git a/src/azul/plugins/metadata/anvil/indexer/aggregate.py b/src/azul/plugins/metadata/anvil/indexer/aggregate.py index e7deaad221..456ea5ff52 100644 --- a/src/azul/plugins/metadata/anvil/indexer/aggregate.py +++ b/src/azul/plugins/metadata/anvil/indexer/aggregate.py @@ -9,6 +9,7 @@ Accumulator, DistinctAccumulator, GroupingAggregator, + SetAccumulator, SetOfDictAccumulator, SimpleAggregator, SumAccumulator, @@ -22,14 +23,49 @@ ) -class ActivityAggregator(SimpleAggregator): - pass +class AnVILEntityAggregator(SimpleAggregator): + + def _never_accumulate(self) -> set[str]: + entity_type = self.entity_type + if entity_type == 'activities': + entity_type = 'activity' + elif entity_type == 'diagnoses': + entity_type = 'diagnosis' + else: + assert entity_type.endswith('s') + entity_type = entity_type[:-1] + return { + entity_type + '_id', + 'document_id', + 'source_datarepo_row_ids' + } + + +class ActivityAggregator(AnVILEntityAggregator): + + def _accumulator(self, field: str) -> Accumulator | None: + if ( + field in self._never_accumulate() + and self.outer_entity_type != 'files' + ): + # These fields are only aggregated for files, where they are needed + # for compact and PFB manifests + return None + else: + return super()._accumulator(field) -class BiosampleAggregator(SimpleAggregator): +class BiosampleAggregator(AnVILEntityAggregator): def _accumulator(self, field: str) -> Accumulator | None: - if field == 'donor_age_at_collection': + if ( + field in self._never_accumulate() + and self.outer_entity_type != 'files' + ): + # These fields are only aggregated for files, where they are needed + # for compact and PFB manifests + return None + elif field == 'donor_age_at_collection': return SetOfDictAccumulator(max_size=100, key=compose_keys(none_safe_tuple_key(none_last=True), itemgetter('lte', 'gte'))) @@ -38,25 +74,62 @@ def _accumulator(self, field: str) -> Accumulator | None: class DatasetAggregator(SimpleAggregator): - pass + + def _accumulator(self, field: str) -> Accumulator | None: + if field == 'document_id': + # If any dataset IDs are missing from the aggregate, those datasets + # will be omitted during the verbatim handover. Datasets are a "hot" + # entity type, and we can't track their hubs in replica documents, + # so we rely on the inner entity IDs instead. We also need to + # aggregate document_id to allow filtering by the value on + # non-dataset endpoints. + return super()._accumulator(field) + elif field == 'source_datarepo_row_ids' and self.outer_entity_type != 'files': + # These fields are only aggregated for files, where they are needed + # for compact and PFB manifests + return None + else: + return super()._accumulator(field) -class DiagnosisAggregator(SimpleAggregator): +class DiagnosisAggregator(AnVILEntityAggregator): def _accumulator(self, field: str) -> Accumulator | None: - if field in ('diagnosis_age', 'onset_age'): + if ( + field in self._never_accumulate() + and self.outer_entity_type != 'files' + ): + # These fields are only aggregated for files, where they are needed + # for compact and PFB manifests + return None + elif field in ('diagnosis_age', 'onset_age'): return SetOfDictAccumulator(max_size=100, key=compose_keys(none_safe_tuple_key(none_last=True), itemgetter('lte', 'gte'))) + elif field == 'disease': + return SetAccumulator(max_size=100, + # Some AnVIL datasets have excessive numbers + # of disease values, all being accessions. + allow_overflow=self.outer_entity_type == 'datasets') else: return super()._accumulator(field) -class DonorAggregator(SimpleAggregator): - pass +class DonorAggregator(AnVILEntityAggregator): + + def _accumulator(self, field: str) -> Accumulator | None: + if ( + field in self._never_accumulate() + and self.outer_entity_type != 'files' + ): + # These fields are only aggregated for files, where they are needed + # for compact and PFB manifests + return None + else: + return super()._accumulator(field) -class FileAggregator(GroupingAggregator): +class FileAggregator(AnVILEntityAggregator, GroupingAggregator): def _transform_entity(self, entity: JSON) -> JSON: file_aggregate_fields = { @@ -72,7 +145,14 @@ def _group_keys(self, entity) -> tuple[Any, ...]: return entity['file_format'], def _accumulator(self, field: str) -> Accumulator | None: - if field in ('count', 'file_size'): + if field in self._never_accumulate() | { + 'drs_uri', + 'file_md5sum', + 'file_name', + 'version' + }: + return None + elif field in ('count', 'file_size'): return DistinctAccumulator(SumAccumulator()) else: return super()._accumulator(field) diff --git a/src/azul/plugins/metadata/anvil/indexer/transform.py b/src/azul/plugins/metadata/anvil/indexer/transform.py index 35fe77d749..d3fdaf87ec 100644 --- a/src/azul/plugins/metadata/anvil/indexer/transform.py +++ b/src/azul/plugins/metadata/anvil/indexer/transform.py @@ -181,7 +181,9 @@ def aggregator(cls, entity_type) -> EntityAggregator: agg_cls = FileAggregator else: assert False, entity_type - return agg_cls(cls.entity_type(), entity_type) + return agg_cls(cls.entity_type(), entity_type, + strict=(issubclass(cls, ReplicaTransformer) + and entity_type in cls.hot_entity_types().values())) def estimate(self, partition: BundlePartition) -> int: # Orphans are not considered when deciding whether to partition the diff --git a/src/azul/plugins/metadata/hca/__init__.py b/src/azul/plugins/metadata/hca/__init__.py index 7f554f351f..63c2ab2c73 100644 --- a/src/azul/plugins/metadata/hca/__init__.py +++ b/src/azul/plugins/metadata/hca/__init__.py @@ -186,7 +186,7 @@ def exposed_indices(self) -> dict[EntityType, Sorting]: files=Sorting(field_name='fileName'), projects=Sorting(field_name='projectTitle', max_page_size=75), - samples=Sorting(field_name='sampleId') + samples=Sorting(field_name='entryId') ) @property @@ -279,7 +279,6 @@ def _field_mapping(self) -> InverseFieldMapping: 'donor_count': 'donorCount' }, 'samples': { - 'biomaterial_id': 'sampleId', 'entity_type': 'sampleEntityType', 'organ': 'organ', 'organ_part': 'organPart', diff --git a/src/azul/plugins/metadata/hca/indexer/aggregate.py b/src/azul/plugins/metadata/hca/indexer/aggregate.py index f92cc5b169..b94eee8e56 100644 --- a/src/azul/plugins/metadata/hca/indexer/aggregate.py +++ b/src/azul/plugins/metadata/hca/indexer/aggregate.py @@ -82,6 +82,12 @@ def to_json(self) -> JSON: effective_cell_count=self.effective_cell_count) +class HCAEntityAggregator(SimpleAggregator): + + def _never_accumulate(self) -> set[str]: + return {'biomaterial_id', 'document_id'} + + class FileAggregator(GroupingAggregator): def _transform_entity(self, entity: JSON) -> JSON: @@ -115,15 +121,41 @@ def _default_accumulator(self) -> Accumulator | None: return None -class SampleAggregator(SimpleAggregator): - pass +class SampleAggregator(HCAEntityAggregator): + + def _accumulator(self, field) -> Accumulator | None: + if field in self._never_accumulate(): + # These fields are only aggregated for files, where they are needed + # for compact and PFB manifests + if self.outer_entity_type == 'files': + return SetAccumulator(max_size=int(1209 * 1.25)) + else: + return None + else: + return super()._accumulator(field) + +class SpecimenAggregator(HCAEntityAggregator): -class SpecimenAggregator(SimpleAggregator): - pass + def _accumulator(self, field) -> Accumulator | None: + if field in self._never_accumulate(): + # These fields are only aggregated for files, where they are needed + # for compact and PFB manifests + if self.outer_entity_type == 'files': + return SetAccumulator(max_size=int(1209 * 1.25)) + # `document_id` is included in the sample aggregate so that the + # summary response field `specimenCount` can be calculated. This + # should not be a problem since there should only ever be one + # specimen inner entity in a samples outer entity. + elif field == 'document_id' and self.outer_entity_type == 'samples': + return super()._accumulator(field) + else: + return None + else: + return super()._accumulator(field) -class CellSuspensionAggregator(GroupingAggregator): +class CellSuspensionAggregator(HCAEntityAggregator, GroupingAggregator): cell_count_fields = frozenset([ 'total_estimated_cells', 'total_estimated_cells_redundant' @@ -143,14 +175,31 @@ def _group_keys(self, entity) -> tuple[Any, ...]: return frozenset(entity['organ']), def _accumulator(self, field) -> Accumulator | None: - if field in self.cell_count_fields: + if field in self._never_accumulate(): + # These fields are only aggregated for files, where they are needed + # for compact and PFB manifests + if self.outer_entity_type == 'files': + return SetAccumulator(max_size=int(9766 * 1.25)) + else: + return None + elif field in self.cell_count_fields: return DistinctAccumulator(SumAccumulator()) else: return super()._accumulator(field) -class CellLineAggregator(SimpleAggregator): - pass +class CellLineAggregator(HCAEntityAggregator): + + def _accumulator(self, field) -> Accumulator | None: + if field in self._never_accumulate(): + # These fields are only aggregated for files, where they are needed + # for compact and PFB manifests + if self.outer_entity_type == 'files': + return super()._accumulator(field) + else: + return None + else: + return super()._accumulator(field) class DonorOrganismAggregator(SimpleAggregator): @@ -162,10 +211,19 @@ def _transform_entity(self, entity: JSON) -> JSON: } def _accumulator(self, field) -> Accumulator | None: - if field == 'organism_age_range': - return SetAccumulator(max_size=100) + if field == 'biomaterial_id': + # These fields are only aggregated for files, where they are needed + # for compact and PFB manifests + if self.outer_entity_type == 'files': + return SetAccumulator(max_size=int(931 * 1.25)) + else: + return None + elif field == 'development_stage': + return SetAccumulator(max_size=int(124 * 1.25)) + elif field == 'organism_age_range': + return SetAccumulator(max_size=int(107 * 1.25)) elif field == 'organism_age': - return SetOfDictAccumulator(max_size=100, + return SetOfDictAccumulator(max_size=int(107 * 1.25), key=compose_keys(none_safe_tuple_key(none_last=True), none_safe_itemgetter('value', 'unit'))) elif field == 'donor_count': @@ -175,22 +233,35 @@ def _accumulator(self, field) -> Accumulator | None: # be omitted during the verbatim handover. Donors are a "hot" entity # type, and we can't track their hubs in replica documents, so we # rely on the inner entity IDs instead. - # - # FIXME: Enforce that hot entity types are completely aggregated - # https://github.com/DataBiosphere/azul/issues/6793 - return SetAccumulator(max_size=100) + return SetAccumulator(max_size=int(931 * 1.25)) else: return super()._accumulator(field) -class OrganoidAggregator(SimpleAggregator): - pass +class OrganoidAggregator(HCAEntityAggregator): + + def _accumulator(self, field) -> Accumulator | None: + if field in self._never_accumulate(): + # These fields are only aggregated for files, where they are needed + # for compact and PFB manifests + if self.outer_entity_type == 'files': + return super()._accumulator(field) + else: + return None + else: + return super()._accumulator(field) class ProjectAggregator(SimpleAggregator): def _accumulator(self, field) -> Accumulator | None: if field == 'document_id': + # If any project IDs are missing from the aggregate, those projects + # will be omitted during the verbatim handover. Projects are a "hot" + # entity type, and we can't track their hubs in replica documents, + # so we rely on the inner entity IDs instead. We also need to + # aggregate `document_id` to allow filtering by `projectId` on + # non-project endpoints. return SetAccumulator(max_size=100) elif field in ('project_description', 'contact_names', @@ -219,9 +290,6 @@ def _accumulator(self, field) -> Accumulator | None: # protocols may be omitted during the verbatim handover. Some # protocols are "hot" entity types, and we can't track their hubs in # replicas, so we rely on the inner entity IDs instead. - # - # FIXME: Enforce that hot entity types are completely aggregated - # https://github.com/DataBiosphere/azul/issues/6793 return SetAccumulator(max_size=100) else: return super()._accumulator(field) @@ -230,31 +298,52 @@ def _default_accumulator(self) -> Accumulator | None: return SetAccumulator() -class SequencingInputAggregator(SimpleAggregator): - pass +class SequencingInputAggregator(HCAEntityAggregator): + + def _accumulator(self, field) -> Accumulator | None: + if field in self._never_accumulate(): + # These fields are only aggregated for files, where they are needed + # for compact and PFB manifests + if self.outer_entity_type == 'files': + return SetAccumulator(max_size=int(7302 * 1.25)) + else: + return None + else: + return super()._accumulator(field) -class SequencingProcessAggregator(SimpleAggregator): +class SequencingProcessAggregator(HCAEntityAggregator): + + def _accumulator(self, field) -> Accumulator | None: + if field in self._never_accumulate(): + # These fields are only aggregated for files, where they are needed + # for compact and PFB manifests + if self.outer_entity_type == 'files': + return SetAccumulator(max_size=int(6357 * 1.25)) + else: + return None + else: + return super()._accumulator(field) def _default_accumulator(self) -> Accumulator | None: return SetAccumulator(max_size=10) -class MatricesAggregator(SimpleAggregator): +class MatricesAggregator(HCAEntityAggregator): def _accumulator(self, field) -> Accumulator | None: - if field == 'document_id': + if field in self._never_accumulate(): return None elif field == 'file': - return DictAccumulator(max_size=100, key=itemgetter('uuid')) + return DictAccumulator(max_size=int(515 * 1.25), key=itemgetter('uuid')) else: return SetAccumulator() -class DateAggregator(SimpleAggregator): +class DateAggregator(HCAEntityAggregator): def _accumulator(self, field) -> Accumulator | None: - if field == 'document_id': + if field in self._never_accumulate(): return None elif field in ('submission_date', 'aggregate_submission_date'): return MinAccumulator() diff --git a/src/azul/plugins/metadata/hca/indexer/transform.py b/src/azul/plugins/metadata/hca/indexer/transform.py index 465f72d4cf..e1f3ffcae4 100644 --- a/src/azul/plugins/metadata/hca/indexer/transform.py +++ b/src/azul/plugins/metadata/hca/indexer/transform.py @@ -511,7 +511,9 @@ def aggregator(cls, entity_type: EntityType) -> EntityAggregator | None: agg_cls = DateAggregator else: agg_cls = SimpleAggregator - return agg_cls(cls.entity_type(), entity_type) + return agg_cls(cls.entity_type(), entity_type, + strict=(issubclass(cls, ReplicaTransformer) + and entity_type in cls.hot_entity_types().values())) def _replica_contents(self, entity: EntityReference) -> JSON: if entity == self.api_bundle.ref: @@ -1461,6 +1463,33 @@ class FileTransformer(PartitionedTransformer[api.File], ReplicaTransformer): def entity_type(cls) -> str: return 'files' + # ┏━━━━━━━━━━━━━━━━━━┓ + # ┃ Project replica ┃ + # ┃ ┃ + # ┣━━━━━━━━━━━━━━━━━━┫ ┏━━━━━━━━━━━━━━━━━━┓ + # ┃ entity_id ┃◀─┐ ┃ File replica ┃ + # ┣━━━━━━━━━━━━━━━━━━┫ │ ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓ ┃ ┃ + # ┃ hub_ids ┃ │ ┃ File aggregate ┃ ┣━━━━━━━━━━━━━━━━━━┫ + # ┗━━━━━━━━━━━━━━━━━━┛ │ ┃ ┃ ┌─▶┃ entity_id ┃ + # │ ┣━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┫ │ ┣━━━━━━━━━━━━━━━━━━┫ + # ┏━━━━━━━━━━━━━━━━━━┓ │ ┃ entity_id ┃──┼─▶┃ hub_ids ┃ + # ┃ Donor replica ┃ │ ┣━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┫ │ ┗━━━━━━━━━━━━━━━━━━┛ + # ┃ ┃ └──┃ contents.projects.document_id ┃ │ + # ┣━━━━━━━━━━━━━━━━━━┫ ┣━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┫ │ ┏━━━━━━━━━━━━━━━━━━┓ + # ┃ entity_id ┃◀────┃ contents.donors.document_id ┃ │ ┃ Specimen replica ┃ + # ┣━━━━━━━━━━━━━━━━━━┫ ┣━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┫ │ ┃ ┃ + # ┃ hub_ids ┃ ┌──┃ contents.protocols.document_id ┃ │ ┣━━━━━━━━━━━━━━━━━━┫ + # ┗━━━━━━━━━━━━━━━━━━┛ │ ┗━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┛ │ ┃ entity_id ┃ + # │ │ ┣━━━━━━━━━━━━━━━━━━┫ + # ┏━━━━━━━━━━━━━━━━━━┓ │ └─▶┃ hub_ids ┃ + # ┃ Protocol replica ┃ │ ┗━━━━━━━━━━━━━━━━━━┛ + # ┃ ┃ │ + # ┣━━━━━━━━━━━━━━━━━━┫ │ + # ┃ entity_id ┃◀─┘ + # ┣━━━━━━━━━━━━━━━━━━┫ + # ┃ hub_ids ┃ + # ┗━━━━━━━━━━━━━━━━━━┛ + # @classmethod def hot_entity_types(cls) -> dict[EntityType, EntityType]: return { @@ -1550,6 +1579,9 @@ def matrix_stratification_values(self, file: api.File) -> JSON: donor.update( { 'biomaterial_id': f'donor_organism_{file_name}', + # Donors are a hot entity type, so they are required + # to have a document_id. + 'document_id': f'donor_organism_{file_name}', } ) contents['donors'].append(donor) @@ -1567,6 +1599,9 @@ def matrix_stratification_values(self, file: api.File) -> JSON: if library is not None: contents['library_preparation_protocols'].append( { + # Library preparation protocols are a hot entity + # type, so they are required to have a document_id. + 'document_id': f'library_preparation_protocol_{file_name}', 'library_construction_approach': json_sorted(library), } ) diff --git a/src/azul/plugins/metadata/hca/service/response.py b/src/azul/plugins/metadata/hca/service/response.py index 768faebff0..8ae7d64215 100644 --- a/src/azul/plugins/metadata/hca/service/response.py +++ b/src/azul/plugins/metadata/hca/service/response.py @@ -429,7 +429,6 @@ def make_file(self, source: SourceSpec, file: JSON) -> JSON: def make_specimen(self, specimen) -> MutableJSON: return { - 'id': specimen['biomaterial_id'], 'organ': specimen.get('organ', None), 'organPart': specimen.get('organ_part', None), 'disease': specimen.get('disease', None), @@ -459,7 +458,6 @@ def make_cell_suspensions(self, entry) -> MutableJSONs: def make_cell_line(self, cell_line) -> MutableJSON: return { - 'id': cell_line['biomaterial_id'], 'cellLineType': cell_line.get('cell_line_type', None), 'modelOrgan': cell_line.get('model_organ', None), } @@ -469,7 +467,6 @@ def make_cell_lines(self, entry) -> MutableJSONs: def make_donor(self, donor) -> MutableJSON: return { - 'id': donor['biomaterial_id'], 'donorCount': donor.get('donor_count', None), 'developmentStage': donor.get('development_stage', None), 'genusSpecies': donor.get('genus_species', None), @@ -484,7 +481,6 @@ def make_donors(self, entry) -> MutableJSONs: def make_organoid(self, organoid) -> MutableJSON: return { - 'id': organoid['biomaterial_id'], 'modelOrgan': organoid.get('model_organ', None), 'modelOrganPart': organoid.get('model_organ_part', None) } @@ -493,11 +489,12 @@ def make_organoids(self, entry) -> MutableJSONs: return [self.make_organoid(organoid) for organoid in entry['contents']['organoids']] def make_sample(self, sample, entity_dict, entity_type) -> MutableJSON: - is_aggregate = isinstance(sample['document_id'], list) organ_prop = 'organ' if entity_type == 'specimens' else 'model_organ' + effective_organ = sample[organ_prop] + is_aggregate = isinstance(effective_organ, list) return { 'sampleEntityType': [entity_type] if is_aggregate else entity_type, - 'effectiveOrgan': sample[organ_prop], + 'effectiveOrgan': effective_organ, **entity_dict } diff --git a/src/azul/service/manifest_service.py b/src/azul/service/manifest_service.py index f43f2d0858..0f2c123426 100644 --- a/src/azul/service/manifest_service.py +++ b/src/azul/service/manifest_service.py @@ -1963,6 +1963,12 @@ def _list_replica_keys(self) -> Iterable[ReplicaKeys]: document_ids = [ document_id for entity_type in self.hot_entity_types + # Some "hot" entity types may be missing from hit['contents'], + # e.g. `imaging_protocols` if `contents.imaging_protocols` in + # the source document is an empty list. This is due to our + # document slice (aka an Elasticsearch "source filter") limiting + # the results to fields that match the "includes" pattern, and + # have an actual value. See :meth:`_create_pipeline` for inner_entity in getitem(hit['contents'], entity_type, ()) # `document_id` is a scalar (string) when the inner and outer # entity types match, and an array otherwise. `None` should not diff --git a/test/indexer/data/826dea02-e274-affe-aabc-eb3db63ad068.results.json b/test/indexer/data/826dea02-e274-affe-aabc-eb3db63ad068.results.json index 30b2d92fed..47c6227a9a 100644 --- a/test/indexer/data/826dea02-e274-affe-aabc-eb3db63ad068.results.json +++ b/test/indexer/data/826dea02-e274-affe-aabc-eb3db63ad068.results.json @@ -28,15 +28,6 @@ ], "biosamples": [ { - "document_id": [ - "826dea02-e274-4ffe-aabc-eb3db63ad068" - ], - "source_datarepo_row_ids": [ - "sample:98048c3b-2525-4090-94fd-477de31f2608" - ], - "biosample_id": [ - "f9d40cf6-37b8-22f3-ce35-0dc614d2452b" - ], "anatomical_site": [ "~null" ], @@ -66,9 +57,6 @@ "document_id": [ "2370f948-2783-4eb6-afea-e022897f4dcf" ], - "source_datarepo_row_ids": [ - "workspace_attributes:7a22b629-9d81-4e4d-9297-f9e44ed760bc" - ], "dataset_id": [ "52ee7665-7033-63f2-a8d9-ce8e32666739" ], @@ -97,17 +85,6 @@ ], "diagnoses": [ { - "document_id": [ - "15d85d30-ad4a-4f50-87a8-a27f59dd1b5f", - "939a4bd3-86ed-4a8a-81f4-fbe0ee673461" - ], - "source_datarepo_row_ids": [ - "subject:c23887a0-20c1-44e4-a09e-1c5dfdc2d0ef" - ], - "diagnosis_id": [ - "25ff8d32-18c9-fc3e-020a-5de20d35d906", - "5ebe9bc4-a1be-0ddf-7277-b1e88276d0f6" - ], "disease": [ "redacted-A61iJlLx", "redacted-g50ublm/" @@ -140,15 +117,6 @@ ], "donors": [ { - "document_id": [ - "bfd991f2-2797-4083-972a-da7c6d7f1b2e" - ], - "source_datarepo_row_ids": [ - "subject:c23887a0-20c1-44e4-a09e-1c5dfdc2d0ef" - ], - "donor_id": [ - "1e2bd7e5-f45e-a391-daea-7c060be76acd" - ], "organism_type": [ "redacted-ACw+6ecI" ], @@ -165,15 +133,6 @@ ], "files": [ { - "document_id": [ - "15b76f9c-6b46-433f-851d-34e89f1b9ba6" - ], - "source_datarepo_row_ids": [ - "file_inventory:81d16471-97ac-48fe-99a0-73d9ec62c2c0" - ], - "file_id": [ - "1e269f04-4347-4188-b060-1dcc69e71d67" - ], "data_modality": [ "~null" ], @@ -182,24 +141,12 @@ ], "file_size": 213021639, "file_size_": 213021639, - "file_md5sum": [ - "beec606ee0aa299fdf913f4259316622" - ], "reference_assembly": [ "~null" ], - "file_name": [ - "307500.merged.matefixed.sorted.markeddups.recal.g.vcf.gz" - ], "is_supplementary": [ 0 ], - "version": [ - "2022-06-01T00:00:00.000000Z" - ], - "drs_uri": [ - "drs://mock_tdr.lan/v1_6c87f0e1-509d-46a4-b845-7584df39263b_1e269f04-4347-4188-b060-1dcc69e71d67" - ], "count": 1 } ] @@ -923,18 +870,6 @@ "contents": { "activities": [ { - "document_id": [ - "1509ef40-d1ba-440d-b298-16b7c173dcd4", - "816e364e-1193-4e5b-a91a-14e4b009157c" - ], - "source_datarepo_row_ids": [ - "sequencing:a6c663c7-6f26-4ed2-af9d-48e9c709a22b", - "sequencing:d4f6c0c4-1e11-438e-8218-cfea63b8b051" - ], - "activity_id": [ - "18b3be87-e26b-4376-0d8d-c1e370e90e07", - "a60c5138-3749-f7cb-8714-52d389ad5231" - ], "activity_table": [ "anvil_sequencingactivity" ], @@ -954,15 +889,6 @@ ], "biosamples": [ { - "document_id": [ - "826dea02-e274-4ffe-aabc-eb3db63ad068" - ], - "source_datarepo_row_ids": [ - "sample:98048c3b-2525-4090-94fd-477de31f2608" - ], - "biosample_id": [ - "f9d40cf6-37b8-22f3-ce35-0dc614d2452b" - ], "anatomical_site": [ "~null" ], @@ -1017,17 +943,6 @@ ], "diagnoses": [ { - "document_id": [ - "15d85d30-ad4a-4f50-87a8-a27f59dd1b5f", - "939a4bd3-86ed-4a8a-81f4-fbe0ee673461" - ], - "source_datarepo_row_ids": [ - "subject:c23887a0-20c1-44e4-a09e-1c5dfdc2d0ef" - ], - "diagnosis_id": [ - "25ff8d32-18c9-fc3e-020a-5de20d35d906", - "5ebe9bc4-a1be-0ddf-7277-b1e88276d0f6" - ], "disease": [ "redacted-A61iJlLx", "redacted-g50ublm/" @@ -1060,15 +975,6 @@ ], "donors": [ { - "document_id": [ - "bfd991f2-2797-4083-972a-da7c6d7f1b2e" - ], - "source_datarepo_row_ids": [ - "subject:c23887a0-20c1-44e4-a09e-1c5dfdc2d0ef" - ], - "donor_id": [ - "1e2bd7e5-f45e-a391-daea-7c060be76acd" - ], "organism_type": [ "redacted-ACw+6ecI" ], @@ -1085,15 +991,6 @@ ], "files": [ { - "document_id": [ - "15b76f9c-6b46-433f-851d-34e89f1b9ba6" - ], - "source_datarepo_row_ids": [ - "file_inventory:81d16471-97ac-48fe-99a0-73d9ec62c2c0" - ], - "file_id": [ - "1e269f04-4347-4188-b060-1dcc69e71d67" - ], "data_modality": [ "~null" ], @@ -1102,36 +999,15 @@ ], "file_size": 213021639, "file_size_": 213021639, - "file_md5sum": [ - "beec606ee0aa299fdf913f4259316622" - ], "reference_assembly": [ "~null" ], - "file_name": [ - "307500.merged.matefixed.sorted.markeddups.recal.g.vcf.gz" - ], "is_supplementary": [ 0 ], - "version": [ - "2022-06-01T00:00:00.000000Z" - ], - "drs_uri": [ - "drs://mock_tdr.lan/v1_6c87f0e1-509d-46a4-b845-7584df39263b_1e269f04-4347-4188-b060-1dcc69e71d67" - ], "count": 1 }, { - "document_id": [ - "3b17377b-16b1-431c-9967-e5d01fc5923f" - ], - "source_datarepo_row_ids": [ - "file_inventory:9658d94a-511d-4b49-82c3-d0cb07e0cff2" - ], - "file_id": [ - "8b722e88-8103-49c1-b351-e64fa7c6ab37" - ], "data_modality": [ "~null" ], @@ -1140,24 +1016,12 @@ ], "file_size": 3306845592, "file_size_": 3306845592, - "file_md5sum": [ - "7cd9fd7b54a8bf380e44e93706f1fa2d" - ], "reference_assembly": [ "~null" ], - "file_name": [ - "307500.merged.matefixed.sorted.markeddups.recal.bam" - ], "is_supplementary": [ 0 ], - "version": [ - "2022-06-01T00:00:00.000000Z" - ], - "drs_uri": [ - "drs://mock_tdr.lan/v1_6c87f0e1-509d-46a4-b845-7584df39263b_8b722e88-8103-49c1-b351-e64fa7c6ab37" - ], "count": 1 } ] @@ -1903,15 +1767,6 @@ ], "biosamples": [ { - "document_id": [ - "826dea02-e274-4ffe-aabc-eb3db63ad068" - ], - "source_datarepo_row_ids": [ - "sample:98048c3b-2525-4090-94fd-477de31f2608" - ], - "biosample_id": [ - "f9d40cf6-37b8-22f3-ce35-0dc614d2452b" - ], "anatomical_site": [ "~null" ], @@ -1941,9 +1796,6 @@ "document_id": [ "2370f948-2783-4eb6-afea-e022897f4dcf" ], - "source_datarepo_row_ids": [ - "workspace_attributes:7a22b629-9d81-4e4d-9297-f9e44ed760bc" - ], "dataset_id": [ "52ee7665-7033-63f2-a8d9-ce8e32666739" ], @@ -1972,17 +1824,6 @@ ], "diagnoses": [ { - "document_id": [ - "15d85d30-ad4a-4f50-87a8-a27f59dd1b5f", - "939a4bd3-86ed-4a8a-81f4-fbe0ee673461" - ], - "source_datarepo_row_ids": [ - "subject:c23887a0-20c1-44e4-a09e-1c5dfdc2d0ef" - ], - "diagnosis_id": [ - "25ff8d32-18c9-fc3e-020a-5de20d35d906", - "5ebe9bc4-a1be-0ddf-7277-b1e88276d0f6" - ], "disease": [ "redacted-A61iJlLx", "redacted-g50ublm/" @@ -2015,15 +1856,6 @@ ], "donors": [ { - "document_id": [ - "bfd991f2-2797-4083-972a-da7c6d7f1b2e" - ], - "source_datarepo_row_ids": [ - "subject:c23887a0-20c1-44e4-a09e-1c5dfdc2d0ef" - ], - "donor_id": [ - "1e2bd7e5-f45e-a391-daea-7c060be76acd" - ], "organism_type": [ "redacted-ACw+6ecI" ], @@ -2040,15 +1872,6 @@ ], "files": [ { - "document_id": [ - "3b17377b-16b1-431c-9967-e5d01fc5923f" - ], - "source_datarepo_row_ids": [ - "file_inventory:9658d94a-511d-4b49-82c3-d0cb07e0cff2" - ], - "file_id": [ - "8b722e88-8103-49c1-b351-e64fa7c6ab37" - ], "data_modality": [ "~null" ], @@ -2057,24 +1880,12 @@ ], "file_size": 3306845592, "file_size_": 3306845592, - "file_md5sum": [ - "7cd9fd7b54a8bf380e44e93706f1fa2d" - ], "reference_assembly": [ "~null" ], - "file_name": [ - "307500.merged.matefixed.sorted.markeddups.recal.bam" - ], "is_supplementary": [ 0 ], - "version": [ - "2022-06-01T00:00:00.000000Z" - ], - "drs_uri": [ - "drs://mock_tdr.lan/v1_6c87f0e1-509d-46a4-b845-7584df39263b_8b722e88-8103-49c1-b351-e64fa7c6ab37" - ], "count": 1 } ] @@ -2324,18 +2135,6 @@ "contents": { "activities": [ { - "document_id": [ - "1509ef40-d1ba-440d-b298-16b7c173dcd4", - "816e364e-1193-4e5b-a91a-14e4b009157c" - ], - "source_datarepo_row_ids": [ - "sequencing:a6c663c7-6f26-4ed2-af9d-48e9c709a22b", - "sequencing:d4f6c0c4-1e11-438e-8218-cfea63b8b051" - ], - "activity_id": [ - "18b3be87-e26b-4376-0d8d-c1e370e90e07", - "a60c5138-3749-f7cb-8714-52d389ad5231" - ], "activity_table": [ "anvil_sequencingactivity" ], @@ -2379,9 +2178,6 @@ "document_id": [ "2370f948-2783-4eb6-afea-e022897f4dcf" ], - "source_datarepo_row_ids": [ - "workspace_attributes:7a22b629-9d81-4e4d-9297-f9e44ed760bc" - ], "dataset_id": [ "52ee7665-7033-63f2-a8d9-ce8e32666739" ], @@ -2410,17 +2206,6 @@ ], "diagnoses": [ { - "document_id": [ - "15d85d30-ad4a-4f50-87a8-a27f59dd1b5f", - "939a4bd3-86ed-4a8a-81f4-fbe0ee673461" - ], - "source_datarepo_row_ids": [ - "subject:c23887a0-20c1-44e4-a09e-1c5dfdc2d0ef" - ], - "diagnosis_id": [ - "25ff8d32-18c9-fc3e-020a-5de20d35d906", - "5ebe9bc4-a1be-0ddf-7277-b1e88276d0f6" - ], "disease": [ "redacted-A61iJlLx", "redacted-g50ublm/" @@ -2453,15 +2238,6 @@ ], "donors": [ { - "document_id": [ - "bfd991f2-2797-4083-972a-da7c6d7f1b2e" - ], - "source_datarepo_row_ids": [ - "subject:c23887a0-20c1-44e4-a09e-1c5dfdc2d0ef" - ], - "donor_id": [ - "1e2bd7e5-f45e-a391-daea-7c060be76acd" - ], "organism_type": [ "redacted-ACw+6ecI" ], @@ -2478,15 +2254,6 @@ ], "files": [ { - "document_id": [ - "15b76f9c-6b46-433f-851d-34e89f1b9ba6" - ], - "source_datarepo_row_ids": [ - "file_inventory:81d16471-97ac-48fe-99a0-73d9ec62c2c0" - ], - "file_id": [ - "1e269f04-4347-4188-b060-1dcc69e71d67" - ], "data_modality": [ "~null" ], @@ -2495,36 +2262,15 @@ ], "file_size": 213021639, "file_size_": 213021639, - "file_md5sum": [ - "beec606ee0aa299fdf913f4259316622" - ], "reference_assembly": [ "~null" ], - "file_name": [ - "307500.merged.matefixed.sorted.markeddups.recal.g.vcf.gz" - ], "is_supplementary": [ 0 ], - "version": [ - "2022-06-01T00:00:00.000000Z" - ], - "drs_uri": [ - "drs://mock_tdr.lan/v1_6c87f0e1-509d-46a4-b845-7584df39263b_1e269f04-4347-4188-b060-1dcc69e71d67" - ], "count": 1 }, { - "document_id": [ - "3b17377b-16b1-431c-9967-e5d01fc5923f" - ], - "source_datarepo_row_ids": [ - "file_inventory:9658d94a-511d-4b49-82c3-d0cb07e0cff2" - ], - "file_id": [ - "8b722e88-8103-49c1-b351-e64fa7c6ab37" - ], "data_modality": [ "~null" ], @@ -2533,25 +2279,12 @@ ], "file_size": 3306845592, "file_size_": 3306845592, - "file_md5sum": [ - "7cd9fd7b54a8bf380e44e93706f1fa2d" - ], "reference_assembly": [ "~null" ], - "file_name": [ - "307500.merged.matefixed.sorted.markeddups.recal.bam" - ], "is_supplementary": [ 0 ], - "version": [ - "2022-06-01T00:00:00.000000Z" - ], - - "drs_uri": [ - "drs://mock_tdr.lan/v1_6c87f0e1-509d-46a4-b845-7584df39263b_8b722e88-8103-49c1-b351-e64fa7c6ab37" - ], "count": 1 } ] @@ -2848,18 +2581,6 @@ "contents": { "activities": [ { - "document_id": [ - "1509ef40-d1ba-440d-b298-16b7c173dcd4", - "816e364e-1193-4e5b-a91a-14e4b009157c" - ], - "source_datarepo_row_ids": [ - "sequencing:a6c663c7-6f26-4ed2-af9d-48e9c709a22b", - "sequencing:d4f6c0c4-1e11-438e-8218-cfea63b8b051" - ], - "activity_id": [ - "18b3be87-e26b-4376-0d8d-c1e370e90e07", - "a60c5138-3749-f7cb-8714-52d389ad5231" - ], "activity_table": [ "anvil_sequencingactivity" ], @@ -2879,15 +2600,6 @@ ], "biosamples": [ { - "document_id": [ - "826dea02-e274-4ffe-aabc-eb3db63ad068" - ], - "source_datarepo_row_ids": [ - "sample:98048c3b-2525-4090-94fd-477de31f2608" - ], - "biosample_id": [ - "f9d40cf6-37b8-22f3-ce35-0dc614d2452b" - ], "anatomical_site": [ "~null" ], @@ -2917,9 +2629,6 @@ "document_id": [ "2370f948-2783-4eb6-afea-e022897f4dcf" ], - "source_datarepo_row_ids": [ - "workspace_attributes:7a22b629-9d81-4e4d-9297-f9e44ed760bc" - ], "dataset_id": [ "52ee7665-7033-63f2-a8d9-ce8e32666739" ], @@ -2948,17 +2657,6 @@ ], "diagnoses": [ { - "document_id": [ - "15d85d30-ad4a-4f50-87a8-a27f59dd1b5f", - "939a4bd3-86ed-4a8a-81f4-fbe0ee673461" - ], - "source_datarepo_row_ids": [ - "subject:c23887a0-20c1-44e4-a09e-1c5dfdc2d0ef" - ], - "diagnosis_id": [ - "25ff8d32-18c9-fc3e-020a-5de20d35d906", - "5ebe9bc4-a1be-0ddf-7277-b1e88276d0f6" - ], "disease": [ "redacted-A61iJlLx", "redacted-g50ublm/" @@ -2991,15 +2689,6 @@ ], "donors": [ { - "document_id": [ - "bfd991f2-2797-4083-972a-da7c6d7f1b2e" - ], - "source_datarepo_row_ids": [ - "subject:c23887a0-20c1-44e4-a09e-1c5dfdc2d0ef" - ], - "donor_id": [ - "1e2bd7e5-f45e-a391-daea-7c060be76acd" - ], "organism_type": [ "redacted-ACw+6ecI" ], @@ -3016,15 +2705,6 @@ ], "files": [ { - "document_id": [ - "15b76f9c-6b46-433f-851d-34e89f1b9ba6" - ], - "source_datarepo_row_ids": [ - "file_inventory:81d16471-97ac-48fe-99a0-73d9ec62c2c0" - ], - "file_id": [ - "1e269f04-4347-4188-b060-1dcc69e71d67" - ], "data_modality": [ "~null" ], @@ -3033,36 +2713,15 @@ ], "file_size": 213021639, "file_size_": 213021639, - "file_md5sum": [ - "beec606ee0aa299fdf913f4259316622" - ], "reference_assembly": [ "~null" ], - "file_name": [ - "307500.merged.matefixed.sorted.markeddups.recal.g.vcf.gz" - ], "is_supplementary": [ 0 ], - "version": [ - "2022-06-01T00:00:00.000000Z" - ], - "drs_uri": [ - "drs://mock_tdr.lan/v1_6c87f0e1-509d-46a4-b845-7584df39263b_1e269f04-4347-4188-b060-1dcc69e71d67" - ], "count": 1 }, { - "document_id": [ - "3b17377b-16b1-431c-9967-e5d01fc5923f" - ], - "source_datarepo_row_ids": [ - "file_inventory:9658d94a-511d-4b49-82c3-d0cb07e0cff2" - ], - "file_id": [ - "8b722e88-8103-49c1-b351-e64fa7c6ab37" - ], "data_modality": [ "~null" ], @@ -3071,24 +2730,12 @@ ], "file_size": 3306845592, "file_size_": 3306845592, - "file_md5sum": [ - "7cd9fd7b54a8bf380e44e93706f1fa2d" - ], "reference_assembly": [ "~null" ], - "file_name": [ - "307500.merged.matefixed.sorted.markeddups.recal.bam" - ], "is_supplementary": [ 0 ], - "version": [ - "2022-06-01T00:00:00.000000Z" - ], - "drs_uri": [ - "drs://mock_tdr.lan/v1_6c87f0e1-509d-46a4-b845-7584df39263b_8b722e88-8103-49c1-b351-e64fa7c6ab37" - ], "count": 1 } ] @@ -3383,18 +3030,6 @@ "contents": { "activities": [ { - "document_id": [ - "1509ef40-d1ba-440d-b298-16b7c173dcd4", - "816e364e-1193-4e5b-a91a-14e4b009157c" - ], - "source_datarepo_row_ids": [ - "sequencing:a6c663c7-6f26-4ed2-af9d-48e9c709a22b", - "sequencing:d4f6c0c4-1e11-438e-8218-cfea63b8b051" - ], - "activity_id": [ - "18b3be87-e26b-4376-0d8d-c1e370e90e07", - "a60c5138-3749-f7cb-8714-52d389ad5231" - ], "activity_table": [ "anvil_sequencingactivity" ], @@ -3414,15 +3049,6 @@ ], "biosamples": [ { - "document_id": [ - "826dea02-e274-4ffe-aabc-eb3db63ad068" - ], - "source_datarepo_row_ids": [ - "sample:98048c3b-2525-4090-94fd-477de31f2608" - ], - "biosample_id": [ - "f9d40cf6-37b8-22f3-ce35-0dc614d2452b" - ], "anatomical_site": [ "~null" ], @@ -3452,9 +3078,6 @@ "document_id": [ "2370f948-2783-4eb6-afea-e022897f4dcf" ], - "source_datarepo_row_ids": [ - "workspace_attributes:7a22b629-9d81-4e4d-9297-f9e44ed760bc" - ], "dataset_id": [ "52ee7665-7033-63f2-a8d9-ce8e32666739" ], @@ -3483,17 +3106,6 @@ ], "diagnoses": [ { - "document_id": [ - "15d85d30-ad4a-4f50-87a8-a27f59dd1b5f", - "939a4bd3-86ed-4a8a-81f4-fbe0ee673461" - ], - "source_datarepo_row_ids": [ - "subject:c23887a0-20c1-44e4-a09e-1c5dfdc2d0ef" - ], - "diagnosis_id": [ - "25ff8d32-18c9-fc3e-020a-5de20d35d906", - "5ebe9bc4-a1be-0ddf-7277-b1e88276d0f6" - ], "disease": [ "redacted-A61iJlLx", "redacted-g50ublm/" @@ -3543,15 +3155,6 @@ ], "files": [ { - "document_id": [ - "15b76f9c-6b46-433f-851d-34e89f1b9ba6" - ], - "source_datarepo_row_ids": [ - "file_inventory:81d16471-97ac-48fe-99a0-73d9ec62c2c0" - ], - "file_id": [ - "1e269f04-4347-4188-b060-1dcc69e71d67" - ], "data_modality": [ "~null" ], @@ -3560,36 +3163,15 @@ ], "file_size": 213021639, "file_size_": 213021639, - "file_md5sum": [ - "beec606ee0aa299fdf913f4259316622" - ], "reference_assembly": [ "~null" ], - "file_name": [ - "307500.merged.matefixed.sorted.markeddups.recal.g.vcf.gz" - ], "is_supplementary": [ 0 ], - "version": [ - "2022-06-01T00:00:00.000000Z" - ], - "drs_uri": [ - "drs://mock_tdr.lan/v1_6c87f0e1-509d-46a4-b845-7584df39263b_1e269f04-4347-4188-b060-1dcc69e71d67" - ], "count": 1 }, { - "document_id": [ - "3b17377b-16b1-431c-9967-e5d01fc5923f" - ], - "source_datarepo_row_ids": [ - "file_inventory:9658d94a-511d-4b49-82c3-d0cb07e0cff2" - ], - "file_id": [ - "8b722e88-8103-49c1-b351-e64fa7c6ab37" - ], "data_modality": [ "~null" ], @@ -3598,24 +3180,12 @@ ], "file_size": 3306845592, "file_size_": 3306845592, - "file_md5sum": [ - "7cd9fd7b54a8bf380e44e93706f1fa2d" - ], "reference_assembly": [ "~null" ], - "file_name": [ - "307500.merged.matefixed.sorted.markeddups.recal.bam" - ], "is_supplementary": [ 0 ], - "version": [ - "2022-06-01T00:00:00.000000Z" - ], - "drs_uri": [ - "drs://mock_tdr.lan/v1_6c87f0e1-509d-46a4-b845-7584df39263b_8b722e88-8103-49c1-b351-e64fa7c6ab37" - ], "count": 1 } ] diff --git a/test/indexer/data/aaa96233-bf27-44c7-82df-b4dc15ad4d9d.2018-11-02T11:33:44.698028Z.results.json b/test/indexer/data/aaa96233-bf27-44c7-82df-b4dc15ad4d9d.2018-11-02T11:33:44.698028Z.results.json index a77d93573b..00970feaa9 100644 --- a/test/indexer/data/aaa96233-bf27-44c7-82df-b4dc15ad4d9d.2018-11-02T11:33:44.698028Z.results.json +++ b/test/indexer/data/aaa96233-bf27-44c7-82df-b4dc15ad4d9d.2018-11-02T11:33:44.698028Z.results.json @@ -1402,12 +1402,6 @@ "contents": { "samples": [ { - "document_id": [ - "a21dc760-a500-4236-bcff-da34a0e873d2" - ], - "biomaterial_id": [ - "DID_scRSq06_pancreas" - ], "entity_type": [ "specimens" ], @@ -1436,12 +1430,6 @@ "_source": [ "specimen_from_organism" ], - "document_id": [ - "a21dc760-a500-4236-bcff-da34a0e873d2" - ], - "biomaterial_id": [ - "DID_scRSq06_pancreas" - ], "disease": [ "normal" ], @@ -1464,23 +1452,13 @@ ], "sequencing_inputs": [ { - "biomaterial_id": [ - "GSM2172585 1" - ], - "document_id": [ - "412898c5-5b9b-4907-b07c-e9b89666e204" - ], "sequencing_input_type": [ "cell_suspension" ] } ], "sequencing_processes": [ - { - "document_id": [ - "771ddaf6-3a4f-4314-97fe-6294ff8e25a4" - ] - } + {} ], "specimens": [ { @@ -1490,12 +1468,6 @@ "_source": [ "specimen_from_organism" ], - "document_id": [ - "a21dc760-a500-4236-bcff-da34a0e873d2" - ], - "biomaterial_id": [ - "DID_scRSq06_pancreas" - ], "disease": [ "normal" ], @@ -1518,12 +1490,6 @@ ], "cell_suspensions": [ { - "document_id": [ - "412898c5-5b9b-4907-b07c-e9b89666e204" - ], - "biomaterial_id": [ - "GSM2172585 1" - ], "total_estimated_cells_redundant": 0, "total_estimated_cells_redundant_": 0, "total_estimated_cells": 1, @@ -1545,9 +1511,6 @@ "document_id": [ "7b07b9d0-cc0e-4098-9f64-f4a569f7d746" ], - "biomaterial_id": [ - "DID_scRSq06" - ], "donor_count": 1, "donor_count_": 1, "biological_sex": [ @@ -2376,23 +2339,13 @@ ], "sequencing_inputs": [ { - "biomaterial_id": [ - "GSM2172585 1" - ], - "document_id": [ - "412898c5-5b9b-4907-b07c-e9b89666e204" - ], "sequencing_input_type": [ "cell_suspension" ] } ], "sequencing_processes": [ - { - "document_id": [ - "771ddaf6-3a4f-4314-97fe-6294ff8e25a4" - ] - } + {} ], "specimens": [ { @@ -2405,9 +2358,6 @@ "document_id": [ "a21dc760-a500-4236-bcff-da34a0e873d2" ], - "biomaterial_id": [ - "DID_scRSq06_pancreas" - ], "disease": [ "normal" ], @@ -2435,9 +2385,6 @@ "document_id": [ "7b07b9d0-cc0e-4098-9f64-f4a569f7d746" ], - "biomaterial_id": [ - "DID_scRSq06" - ], "donor_count": 1, "donor_count_": 1, "genus_species": [ @@ -2465,12 +2412,6 @@ ], "cell_suspensions": [ { - "document_id": [ - "412898c5-5b9b-4907-b07c-e9b89666e204" - ], - "biomaterial_id": [ - "GSM2172585 1" - ], "selected_cell_type": [ "~null" ], @@ -2609,12 +2550,6 @@ "contents": { "samples": [ { - "document_id": [ - "a21dc760-a500-4236-bcff-da34a0e873d2" - ], - "biomaterial_id": [ - "DID_scRSq06_pancreas" - ], "entity_type": [ "specimens" ], @@ -2643,12 +2578,6 @@ "_source": [ "specimen_from_organism" ], - "document_id": [ - "a21dc760-a500-4236-bcff-da34a0e873d2" - ], - "biomaterial_id": [ - "DID_scRSq06_pancreas" - ], "disease": [ "normal" ], @@ -2671,23 +2600,13 @@ ], "sequencing_inputs": [ { - "biomaterial_id": [ - "GSM2172585 1" - ], - "document_id": [ - "412898c5-5b9b-4907-b07c-e9b89666e204" - ], "sequencing_input_type": [ "cell_suspension" ] } ], "sequencing_processes": [ - { - "document_id": [ - "771ddaf6-3a4f-4314-97fe-6294ff8e25a4" - ] - } + {} ], "specimens": [ { @@ -2697,12 +2616,6 @@ "_source": [ "specimen_from_organism" ], - "document_id": [ - "a21dc760-a500-4236-bcff-da34a0e873d2" - ], - "biomaterial_id": [ - "DID_scRSq06_pancreas" - ], "disease": [ "normal" ], @@ -2730,9 +2643,6 @@ "document_id": [ "7b07b9d0-cc0e-4098-9f64-f4a569f7d746" ], - "biomaterial_id": [ - "DID_scRSq06" - ], "donor_count": 1, "donor_count_": 1, "genus_species": [ @@ -2760,12 +2670,6 @@ ], "cell_suspensions": [ { - "document_id": [ - "412898c5-5b9b-4907-b07c-e9b89666e204" - ], - "biomaterial_id": [ - "GSM2172585 1" - ], "selected_cell_type": [ "~null" ], @@ -3208,12 +3112,6 @@ "contents": { "samples": [ { - "document_id": [ - "a21dc760-a500-4236-bcff-da34a0e873d2" - ], - "biomaterial_id": [ - "DID_scRSq06_pancreas" - ], "entity_type": [ "specimens" ], @@ -3242,12 +3140,6 @@ "_source": [ "specimen_from_organism" ], - "document_id": [ - "a21dc760-a500-4236-bcff-da34a0e873d2" - ], - "biomaterial_id": [ - "DID_scRSq06_pancreas" - ], "disease": [ "normal" ], @@ -3270,23 +3162,13 @@ ], "sequencing_inputs": [ { - "biomaterial_id": [ - "GSM2172585 1" - ], - "document_id": [ - "412898c5-5b9b-4907-b07c-e9b89666e204" - ], "sequencing_input_type": [ "cell_suspension" ] } ], "sequencing_processes": [ - { - "document_id": [ - "771ddaf6-3a4f-4314-97fe-6294ff8e25a4" - ] - } + {} ], "specimens": [ { @@ -3296,12 +3178,6 @@ "_source": [ "specimen_from_organism" ], - "document_id": [ - "a21dc760-a500-4236-bcff-da34a0e873d2" - ], - "biomaterial_id": [ - "DID_scRSq06_pancreas" - ], "disease": [ "normal" ], @@ -3347,9 +3223,6 @@ "document_id": [ "7b07b9d0-cc0e-4098-9f64-f4a569f7d746" ], - "biomaterial_id": [ - "DID_scRSq06" - ], "donor_count": 1, "donor_count_": 1, "biological_sex": [ diff --git a/test/indexer/test_indexer.py b/test/indexer/test_indexer.py index 2669962f9b..f18912d5cf 100644 --- a/test/indexer/test_indexer.py +++ b/test/indexer/test_indexer.py @@ -1932,15 +1932,13 @@ def test_cell_line_sample(self): if qualifier == 'samples': sample = one(contents['samples']) sample_entity_type = sample['entity_type'] - if aggregate: - document_ids = one(contents[sample_entity_type])['document_id'] - elif contribution: + if contribution: document_ids = [d['document_id'] for d in contents[sample_entity_type]] + self.assertIn(sample['document_id'], document_ids) entity = one(d for d in contents[sample_entity_type] if d['document_id'] == sample['document_id']) self.assertEqual(sample['biomaterial_id'], entity['biomaterial_id']) else: - assert False, doc_type - self.assertIn(sample['document_id'], document_ids) + assert aggregate, doc_type self.assertEqual(one(contents['specimens'])['organ'], ['blood'] if aggregate else 'blood') self.assertEqual(one(contents['specimens'])['organ_part'], ['venous blood']) self.assertEqual(len(contents['cell_lines']), 1 if aggregate else 2) @@ -2027,6 +2025,7 @@ def test_sample_with_no_donor(self): k: (v if isinstance(v, list) else [v]) + ([] if k == 'organism_age_range' or True else [None]) for k, v in donor.items() + if k != 'biomaterial_id' } } hits = self._get_all_hits() diff --git a/test/service/test_app_logging.py b/test/service/test_app_logging.py index 15c2aab84d..cacd2f6e1d 100644 --- a/test/service/test_app_logging.py +++ b/test/service/test_app_logging.py @@ -153,7 +153,7 @@ def filter_body(organ: str) -> JSON: elif debug == 1: expected_log = f'… with a response body starting in {body[:prefix_len]}' elif debug > 1: - expected_log = f'… with a response body of length 9137 being {body}' + expected_log = f'… with a response body of length 9050 being {body}' else: assert False self.assertEqual(expected_log, body_log_message) diff --git a/test/service/test_response.py b/test/service/test_response.py index 9ef07e8adb..a4d359c260 100644 --- a/test/service/test_response.py +++ b/test/service/test_response.py @@ -269,7 +269,6 @@ def test_response_stage_files(self): 'disease': ['normal'], 'developmentStage': [None], 'genusSpecies': ['Australopithecus'], - 'id': ['DID_scRSq06'], 'donorCount': 1, 'organismAge': [{'value': '38', 'unit': 'year'}], 'organismAgeRange': [[1198368000.0, 1198368000.0]], @@ -329,7 +328,6 @@ def test_response_stage_files(self): 'sampleEntityType': ['specimens'], 'effectiveOrgan': ['pancreas'], 'disease': ['normal'], - 'id': ['DID_scRSq06_pancreas'], 'organ': ['pancreas'], 'organPart': ['islet of Langerhans'], 'preservationMethod': [None], @@ -346,7 +344,6 @@ def test_response_stage_files(self): 'specimens': [ { 'disease': ['normal'], - 'id': ['DID_scRSq06_pancreas'], 'organ': ['pancreas'], 'organPart': ['islet of Langerhans'], 'preservationMethod': [None], @@ -537,7 +534,6 @@ def test_response_stage_projects(self): 'disease': ['normal'], 'developmentStage': [None], 'genusSpecies': ['Australopithecus'], - 'id': ['DID_scRSq06'], 'donorCount': 1, 'organismAge': [{'value': '38', 'unit': 'year'}], 'organismAgeRange': [[1198368000.0, 1198368000.0]], @@ -656,7 +652,6 @@ def test_response_stage_projects(self): 'sampleEntityType': ['specimens'], 'effectiveOrgan': ['pancreas'], 'disease': ['normal'], - 'id': ['DID_scRSq06_pancreas'], 'organ': ['pancreas'], 'organPart': ['islet of Langerhans'], 'preservationMethod': [None], @@ -673,7 +668,6 @@ def test_response_stage_projects(self): 'specimens': [ { 'disease': ['normal'], - 'id': ['DID_scRSq06_pancreas'], 'organ': ['pancreas'], 'organPart': ['islet of Langerhans'], 'preservationMethod': [None], @@ -768,7 +762,6 @@ def test_response_stage_projects_accessions(self): 'disease': ['H syndrome'], 'developmentStage': ['human adult stage'], 'genusSpecies': ['Homo sapiens'], - 'id': ['donor_ID_1'], 'donorCount': 1, 'organismAge': [{'value': '20', 'unit': 'year'}], 'organismAgeRange': [[630720000.0, 630720000.0]], @@ -923,7 +916,6 @@ def test_response_stage_projects_accessions(self): 'sampleEntityType': ['specimens'], 'effectiveOrgan': ['brain'], 'disease': ['H syndrome'], - 'id': ['specimen_ID_1'], 'organ': ['brain'], 'organPart': ['amygdala'], 'preservationMethod': [None], @@ -940,7 +932,6 @@ def test_response_stage_projects_accessions(self): 'specimens': [ { 'disease': ['H syndrome'], - 'id': ['specimen_ID_1'], 'organ': ['brain'], 'organPart': ['amygdala'], 'preservationMethod': [None], @@ -982,7 +973,6 @@ def test_response_stage_projects_cell_line(self): stage = self._response_stage('projects') response = stage.process_response((hits, self.paginations[0], {})) expected_cell_lines = { - 'id': ['cell_line_Day7_hiPSC-CM_BioRep2', 'cell_line_GM18517'], 'cellLineType': ['primary', 'stem cell-derived'], 'modelOrgan': ['blood (parent_cell_line)', 'blood (child_cell_line)'], } @@ -992,7 +982,6 @@ def test_response_stage_projects_cell_line(self): expected_samples = { 'sampleEntityType': ['cellLines'], 'effectiveOrgan': ['blood (child_cell_line)'], - 'id': ['cell_line_Day7_hiPSC-CM_BioRep2'], 'cellLineType': ['stem cell-derived'], 'modelOrgan': ['blood (child_cell_line)'], } @@ -1225,12 +1214,6 @@ def test_ranged_values(self): 'genusSpecies': [ 'Homo sapiens' ], - 'id': [ - 'HPSI0314i-hoik', - 'HPSI0214i-wibj', - 'HPSI0314i-sojd', - 'HPSI0214i-kucg' - ], 'donorCount': 4, 'organismAge': [ {'value': '45-49', 'unit': 'year'}, @@ -1253,12 +1236,6 @@ def test_ranged_values(self): 'genusSpecies': [ 'Homo sapiens' ], - 'id': [ - 'HPSI0314i-hoik', - 'HPSI0214i-wibj', - 'HPSI0314i-sojd', - 'HPSI0214i-kucg' - ], 'donorCount': 4, 'organismAge': [ {'value': '40-44', 'unit': 'year'}, @@ -2087,7 +2064,7 @@ def test_bad_search_after_search_before(self): """ Test that invalid JSON for search_after or search_before raise a 400 """ - query_params = self._params(size=1, sort='sampleId', order='asc') + query_params = self._params(size=1, sort='entryId', order='asc') url = self.base_url.set(path='/index/samples', args=query_params) # Get page 1 response = requests.get(str(url)) @@ -2392,14 +2369,12 @@ def test_inner_entity_samples(self): { 'sampleEntityType': ['cellLines'], 'effectiveOrgan': ['immune system'], - 'id': ['Cell_line_2'], 'cellLineType': ['primary'], 'modelOrgan': ['immune system'], }, { 'sampleEntityType': ['specimens'], 'effectiveOrgan': ['embryo'], - 'id': ['Specimen1'], 'organ': ['embryo'], 'organPart': ['skin epidermis'], 'disease': ['normal'], @@ -2414,12 +2389,6 @@ def test_inner_entity_samples(self): { 'sampleEntityType': ['organoids'], 'effectiveOrgan': ['Brain'], - 'id': [ - 'Org_HPSI0214i-kucg_2_2', - 'Org_HPSI0214i-wibj_2_2', - 'Org_HPSI0314i-hoik_1_2', - 'Org_HPSI0314i-sojd_3_2', - ], 'modelOrgan': ['Brain'], 'modelOrganPart': [None], } @@ -2432,14 +2401,12 @@ def test_inner_entity_samples(self): { 'sampleEntityType': ['cellLines'], 'effectiveOrgan': ['immune system'], - 'id': ['Cell_line_2'], 'cellLineType': ['primary'], 'modelOrgan': ['immune system'], }, { 'sampleEntityType': ['specimens'], 'effectiveOrgan': ['embryo'], - 'id': ['Specimen1'], 'organ': ['embryo'], 'organPart': ['skin epidermis'], 'disease': ['normal'], @@ -2451,7 +2418,6 @@ def test_inner_entity_samples(self): { 'sampleEntityType': ['specimens'], 'effectiveOrgan': ['pancreas'], - 'id': ['DID_scRSq06_pancreas'], 'organ': ['pancreas'], 'organPart': ['islet of Langerhans'], 'disease': ['normal'], @@ -3724,7 +3690,7 @@ def test(self): 'default_order': 'asc' }, 'samples': { - 'default_sort': 'sampleId', + 'default_sort': 'entryId', 'default_order': 'asc' } }