Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
59 commits
Select commit Hold shift + click to select a range
0d49582
[A] Bump service lambda version
dsotirho-ucsc Mar 10, 2026
873e675
[A] Remove sort & filter key sampleId from HCA (#6793)
dsotirho-ucsc Jan 5, 2026
3c59cef
Document need for aggregation of projects.document_id in HCA (#6793)
dsotirho-ucsc Jan 9, 2026
cbe835f
[r] Conditionally remove aggregation of cell_suspensions.biomaterial_…
dsotirho-ucsc Jan 6, 2026
6876dc9
[r] Conditionally remove aggregation of cell_suspensions.document_id …
dsotirho-ucsc Jan 6, 2026
729d6bb
[r A] Conditionally remove aggregation of cell_lines.biomaterial_id i…
dsotirho-ucsc Jan 6, 2026
e7e4d9d
[r] Conditionally remove aggregation of cell_lines.document_id in HCA…
dsotirho-ucsc Jan 6, 2026
6f99b22
[r A] Conditionally remove aggregation of donors.biomaterial_id in HC…
dsotirho-ucsc Jan 6, 2026
18bf9eb
[r A] Conditionally remove aggregation of organoids.biomaterial_id in…
dsotirho-ucsc Jan 6, 2026
4bbbbda
[r] Conditionally remove aggregation of organoids.document_id in HCA …
dsotirho-ucsc Jan 6, 2026
0eb96a3
[r] Conditionally remove aggregation of sequencing_inputs.biomaterial…
dsotirho-ucsc Jan 6, 2026
27f6387
[r] Conditionally remove aggregation of sequencing_inputs.document_id…
dsotirho-ucsc Jan 6, 2026
9b6d24d
[r] Conditionally remove aggregation of sequencing_processes.document…
dsotirho-ucsc Jan 6, 2026
f6ff6b2
[r A] Conditionally remove aggregation of specimens.biomaterial_id in…
dsotirho-ucsc Jan 6, 2026
361b2e8
[r] Conditionally remove aggregation of specimens.document_id in HCA …
dsotirho-ucsc Jan 6, 2026
6e98dd5
[r] Conditionally remove aggregation of samples.biomaterial_id in HCA…
dsotirho-ucsc Jan 6, 2026
c627db0
[r] Conditionally remove aggregation of samples.document_id in HCA (#…
dsotirho-ucsc Jan 6, 2026
a3565af
[r] Conditionally remove aggregation of activities.activity_id in AnV…
dsotirho-ucsc Jan 7, 2026
89d97c7
[r] Conditionally remove aggregation of activities.document_id in AnV…
dsotirho-ucsc Jan 7, 2026
c68ff9d
[r] Conditionally remove aggregation of activities.source_datarepo_ro…
dsotirho-ucsc Jan 7, 2026
592670b
[r] Conditionally remove aggregation of biosamples.biosample_id in An…
dsotirho-ucsc Jan 7, 2026
904b6fe
[r] Conditionally remove aggregation of biosamples.document_id in AnV…
dsotirho-ucsc Jan 7, 2026
6ba7ddf
[r] Conditionally remove aggregation of biosamples.source_datarepo_ro…
dsotirho-ucsc Jan 7, 2026
0de0507
Document need for aggregation of datasets.document_id in AnVIL (#6793)
dsotirho-ucsc Jan 7, 2026
ac5d8cd
[r] Remove aggregation of datasets.source_datarepo_row_ids in AnVIL (…
dsotirho-ucsc Jan 21, 2026
8a0c12f
[r] Conditionally remove aggregation of diagnoses.diagnosis_id in AnV…
dsotirho-ucsc Jan 7, 2026
083710d
[r] Conditionally remove aggregation of diagnoses.document_id in AnVI…
dsotirho-ucsc Jan 7, 2026
0dae10f
[r] Conditionally remove aggregation of diagnoses.source_datarepo_row…
dsotirho-ucsc Jan 7, 2026
3b59f00
[r] Conditionally remove aggregation of donors.document_id in AnVIL (…
dsotirho-ucsc Jan 7, 2026
6e5faa1
[r] Conditionally remove aggregation of donors.donor_id in AnVIL (#6793)
dsotirho-ucsc Jan 7, 2026
955bdca
[r] Conditionally remove aggregation of donors.source_datarepo_row_id…
dsotirho-ucsc Jan 7, 2026
66a262f
[r] Remove aggregation of files.document_id in AnVIL (#6793)
dsotirho-ucsc Jan 7, 2026
06cf22c
[r] Remove aggregation of files.drs_uri in AnVIL (#6793)
dsotirho-ucsc Jan 7, 2026
74182a5
[r] Remove aggregation of files.file_id in AnVIL (#6793)
dsotirho-ucsc Jan 7, 2026
f986a87
[r] Remove aggregation of files.file_md5sum in AnVIL (#6793)
dsotirho-ucsc Jan 7, 2026
595b4ec
[r] Remove aggregation of files.file_name in AnVIL (#6793)
dsotirho-ucsc Jan 7, 2026
d849810
[r] Remove aggregation of files.source_datarepo_row_ids in AnVIL (#6793)
dsotirho-ucsc Jan 7, 2026
d7794b1
[r] Remove aggregation of files.version in AnVIL (#6793)
dsotirho-ucsc Jan 7, 2026
b816da3
[r] Increase accumulator limit for cell_suspensions.biomaterial_id an…
dsotirho-ucsc Jan 21, 2026
6f6a249
[r] Increase accumulator limit for donors.biomaterial_id in HCA (#6793)
dsotirho-ucsc Jan 21, 2026
b398f9b
[r] Increase accumulator limit for donors.development_stage in HCA (#…
dsotirho-ucsc Jan 7, 2026
91db4d5
[r] Increase accumulator limit for donors.document_id in HCA (#6793)
dsotirho-ucsc Jan 9, 2026
b53a53f
[r] Increase accumulator limit for donors.organism_age_range in HCA (…
dsotirho-ucsc Jan 7, 2026
a6f4dec
[r] Increase accumulator limit for donors.organism_age in HCA (#6793)
dsotirho-ucsc Jan 7, 2026
b3c4aa4
[r] Increase accumulator limit for matrices.file in HCA (#6793)
dsotirho-ucsc Jan 7, 2026
2fbbe34
[r] Increase accumulator limit for samples.biomaterial_id and documen…
dsotirho-ucsc Jan 20, 2026
ba29ce5
[r] Increase accumulator limit for sequencing_inputs.biomaterial_id a…
dsotirho-ucsc Jan 21, 2026
218d75d
[r] Increase accumulator limit for sequencing_processes.document_id i…
dsotirho-ucsc Jan 20, 2026
528c0a6
[r] Increase accumulator limit for specimens.biomaterial_id and docum…
dsotirho-ucsc Jan 20, 2026
b27bf7a
[1/3] Raise exception if aggregation drops values, add overflow option
dsotirho-ucsc Feb 27, 2026
980b0c1
[r 2/3] Raise exception if aggregation drops values, add overflow option
dsotirho-ucsc Dec 18, 2025
86763c9
[3/3] Raise exception if aggregation drops values, add overflow option
dsotirho-ucsc Feb 27, 2026
43b40bf
Add comment for need of using getitem()
dsotirho-ucsc Jan 30, 2026
fe0ba34
fixup! Add comment for need of using getitem()
dsotirho-ucsc Apr 3, 2026
7ec35f1
Add comment for file aggregate and replica documents relationship
dsotirho-ucsc Mar 4, 2026
325c415
fixup! Add comment for file aggregate and replica documents relationship
dsotirho-ucsc Mar 10, 2026
b7677c8
Refactor field exclusions from aggregators
dsotirho-ucsc Mar 3, 2026
b634592
fixup! Refactor field exclusions from aggregators
dsotirho-ucsc Apr 7, 2026
b011c59
Enforce that hot entity ids are fully accumulated (#6793)
dsotirho-ucsc Apr 9, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion lambdas/service/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@
# changes and reset the minor version to zero. Otherwise, increment only
# the minor version for backwards compatible changes. A backwards
# compatible change is one that does not require updates to clients.
'version': '16.1',
'version': '17.0',
'description': fd(f'''
# Overview

Expand Down
138 changes: 8 additions & 130 deletions lambdas/service/openapi.json

Large diffs are not rendered by default.

41 changes: 37 additions & 4 deletions src/azul/indexer/aggregate.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,9 @@ class SetAccumulator[V: Hashable](Accumulator[V, list[V]]):

def __init__(self,
max_size: int | None = None,
key: Callable[[V], SupportsRichComparison] | None = None
key: Callable[[V], SupportsRichComparison] | None = None,
*,
allow_overflow: bool = False
) -> None:
"""
:param max_size: the maximum number of elements to retain
Expand All @@ -147,6 +149,7 @@ def __init__(self,
self.value: set[V] = set()
self.max_size = max_size
self.key = none_safe_key(none_last=True) if key is None else key
self.allow_overflow = allow_overflow

def accumulate(self, value: V | list[V]) -> int:
"""
Expand Down Expand Up @@ -553,9 +556,23 @@ def get(self) -> int:

class EntityAggregator(metaclass=ABCMeta):

def __init__(self, outer_entity_type: EntityType, entity_type: EntityType):
def __init__(self,
outer_entity_type: EntityType,
entity_type: EntityType,
strict: bool = False):
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm confused by this. The aim of this PR is to make overflows an error, but the default for strict is False? PL please.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Decided in PL to rename strict to is_hot.

"""
:param outer_entity_type: The entity type of the aggregate document.

:param entity_type: The entity type of the inner entities being
accumulated.

:param strict: Enforce complete accumulation of `document_id` for the
inner entity type. Required for "hot" entity types, whose
replicas don't track hub IDs.
"""
self.outer_entity_type = outer_entity_type
self.entity_type = entity_type
self.strict = strict

def _transform_entity(self, entity: JSON) -> JSON:
return entity
Expand Down Expand Up @@ -600,13 +617,29 @@ def _accumulate(self, aggregate: Aggregate, entity: JSON) -> None:
accumulator.accumulate(value)

def _aggregate(self, aggregate: Aggregate) -> JSON:
if self.strict:
accumulator = aggregate.get('document_id')
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This conflates an absent key with a key whose value is None. One of them represents a violation of the "hot" requirement, the other is a genuine, unexpected bug.

assert accumulator is not None, R(
'Hot entity types must always accumulate document_id',
self.entity_type, aggregate.keys()
)
assert not (isinstance(accumulator, SetAccumulator) and accumulator.allow_overflow), R(
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We should pull the allow_overflow attribute up into the base class, as a class attribute, initialized to False. Then you won't need the type check to guard the attribute access.

The constructor argument can remain in SetAccumulator.

'allow_overflow is not permitted when accumulating document_id '
'in hot entity types', self.entity_type
)
result = {}
for k, accumulator in aggregate.items():
if accumulator is not None:
result[k] = accumulator.get()
if accumulator.dropped > 0:
log.warning('Values were dropped %d times while aggregating %s.%s into %s',
accumulator.dropped, self.entity_type, k, self.outer_entity_type)
message = (
f'Values were dropped {accumulator.dropped} times while aggregating '
f'{self.entity_type}.{k} into {self.outer_entity_type}'
)
if isinstance(accumulator, SetAccumulator) and accumulator.allow_overflow:
log.warning(message)
else:
assert False, R(message)
return result


Expand Down
102 changes: 91 additions & 11 deletions src/azul/plugins/metadata/anvil/indexer/aggregate.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
Accumulator,
DistinctAccumulator,
GroupingAggregator,
SetAccumulator,
SetOfDictAccumulator,
SimpleAggregator,
SumAccumulator,
Expand All @@ -22,14 +23,49 @@
)


class ActivityAggregator(SimpleAggregator):
pass
class AnVILEntityAggregator(SimpleAggregator):

def _never_accumulate(self) -> set[str]:
entity_type = self.entity_type
if entity_type == 'activities':
entity_type = 'activity'
elif entity_type == 'diagnoses':
entity_type = 'diagnosis'
else:
assert entity_type.endswith('s')
entity_type = entity_type[:-1]
return {
entity_type + '_id',
'document_id',
'source_datarepo_row_ids'
}


class ActivityAggregator(AnVILEntityAggregator):

def _accumulator(self, field: str) -> Accumulator | None:
if (
field in self._never_accumulate()
and self.outer_entity_type != 'files'
):
# These fields are only aggregated for files, where they are needed
# for compact and PFB manifests
return None
else:
return super()._accumulator(field)


class BiosampleAggregator(SimpleAggregator):
class BiosampleAggregator(AnVILEntityAggregator):

def _accumulator(self, field: str) -> Accumulator | None:
if field == 'donor_age_at_collection':
if (
field in self._never_accumulate()
and self.outer_entity_type != 'files'
):
# These fields are only aggregated for files, where they are needed
# for compact and PFB manifests
return None
elif field == 'donor_age_at_collection':
return SetOfDictAccumulator(max_size=100,
key=compose_keys(none_safe_tuple_key(none_last=True),
itemgetter('lte', 'gte')))
Expand All @@ -38,25 +74,62 @@ def _accumulator(self, field: str) -> Accumulator | None:


class DatasetAggregator(SimpleAggregator):
pass

def _accumulator(self, field: str) -> Accumulator | None:
if field == 'document_id':
# If any dataset IDs are missing from the aggregate, those datasets
# will be omitted during the verbatim handover. Datasets are a "hot"
# entity type, and we can't track their hubs in replica documents,
# so we rely on the inner entity IDs instead. We also need to
# aggregate document_id to allow filtering by the value on
# non-dataset endpoints.
return super()._accumulator(field)
elif field == 'source_datarepo_row_ids' and self.outer_entity_type != 'files':
# This field is only aggregated for files, where it is needed
# for compact and PFB manifests
return None
else:
return super()._accumulator(field)


class DiagnosisAggregator(SimpleAggregator):
class DiagnosisAggregator(AnVILEntityAggregator):

def _accumulator(self, field: str) -> Accumulator | None:
if field in ('diagnosis_age', 'onset_age'):
if (
field in self._never_accumulate()
and self.outer_entity_type != 'files'
):
# These fields are only aggregated for files, where they are needed
# for compact and PFB manifests
return None
elif field in ('diagnosis_age', 'onset_age'):
return SetOfDictAccumulator(max_size=100,
key=compose_keys(none_safe_tuple_key(none_last=True),
itemgetter('lte', 'gte')))
elif field == 'disease':
return SetAccumulator(max_size=100,
# Some AnVIL datasets have excessive numbers
# of disease values, all being accessions.
allow_overflow=self.outer_entity_type == 'datasets')
else:
return super()._accumulator(field)


class DonorAggregator(SimpleAggregator):
pass
class DonorAggregator(AnVILEntityAggregator):

def _accumulator(self, field: str) -> Accumulator | None:
if (
field in self._never_accumulate()
and self.outer_entity_type != 'files'
):
# These fields are only aggregated for files, where they are needed
# for compact and PFB manifests
return None
else:
return super()._accumulator(field)


class FileAggregator(GroupingAggregator):
class FileAggregator(AnVILEntityAggregator, GroupingAggregator):

def _transform_entity(self, entity: JSON) -> JSON:
file_aggregate_fields = {
Expand All @@ -72,7 +145,14 @@ def _group_keys(self, entity) -> tuple[Any, ...]:
return entity['file_format'],

def _accumulator(self, field: str) -> Accumulator | None:
if field in ('count', 'file_size'):
if field in self._never_accumulate() | {
'drs_uri',
'file_md5sum',
'file_name',
'version'
}:
return None
elif field in ('count', 'file_size'):
return DistinctAccumulator(SumAccumulator())
else:
return super()._accumulator(field)
4 changes: 3 additions & 1 deletion src/azul/plugins/metadata/anvil/indexer/transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -181,7 +181,9 @@ def aggregator(cls, entity_type) -> EntityAggregator:
agg_cls = FileAggregator
else:
assert False, entity_type
return agg_cls(cls.entity_type(), entity_type)
return agg_cls(cls.entity_type(), entity_type,
strict=(issubclass(cls, ReplicaTransformer)
and entity_type in cls.hot_entity_types().values()))

def estimate(self, partition: BundlePartition) -> int:
# Orphans are not considered when deciding whether to partition the
Expand Down
3 changes: 1 addition & 2 deletions src/azul/plugins/metadata/hca/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -186,7 +186,7 @@ def exposed_indices(self) -> dict[EntityType, Sorting]:
files=Sorting(field_name='fileName'),
projects=Sorting(field_name='projectTitle',
max_page_size=75),
samples=Sorting(field_name='sampleId')
samples=Sorting(field_name='entryId')
)

@property
Expand Down Expand Up @@ -279,7 +279,6 @@ def _field_mapping(self) -> InverseFieldMapping:
'donor_count': 'donorCount'
},
'samples': {
'biomaterial_id': 'sampleId',
'entity_type': 'sampleEntityType',
'organ': 'organ',
'organ_part': 'organPart',
Expand Down
Loading
Loading