DataBiosphere · dsotirho-ucsc · Mar 10, 2026 · Jan 5, 2026 · Jan 9, 2026 · Jan 6, 2026
@@ -55,7 +55,7 @@
         # changes and reset the minor version to zero. Otherwise, increment only
         # the minor version for backwards compatible changes. A backwards
         # compatible change is one that does not require updates to clients.
-        'version': '16.1',
+        'version': '17.0',
         'description': fd(f'''
             # Overview
 

@@ -132,7 +132,9 @@ class SetAccumulator[V: Hashable](Accumulator[V, list[V]]):
 
     def __init__(self,
                  max_size: int | None = None,
-                 key: Callable[[V], SupportsRichComparison] | None = None
+                 key: Callable[[V], SupportsRichComparison] | None = None,
+                 *,
+                 allow_overflow: bool = False
                  ) -> None:
         """
         :param max_size: the maximum number of elements to retain
@@ -147,6 +149,7 @@ def __init__(self,
         self.value: set[V] = set()
         self.max_size = max_size
         self.key = none_safe_key(none_last=True) if key is None else key
+        self.allow_overflow = allow_overflow
 
     def accumulate(self, value: V | list[V]) -> int:
         """
@@ -553,9 +556,23 @@ def get(self) -> int:
 
 class EntityAggregator(metaclass=ABCMeta):
 
-    def __init__(self, outer_entity_type: EntityType, entity_type: EntityType):
+    def __init__(self,
+                 outer_entity_type: EntityType,
+                 entity_type: EntityType,
+                 *,
+                 strict: bool = False):
+        """
+        :param outer_entity_type: The entity type of the aggregate document.
+
+        :param entity_type: The type of the inner entities being accumulated.
+
+        :param strict: Enforce complete accumulation of `document_id` for the
+                       inner entity type. Required for "hot" entity types, whose
+                       replicas don't track hub IDs. Must be passed by keyword.
+        """
         self.outer_entity_type = outer_entity_type
         self.entity_type = entity_type
+        self.strict = strict
 
     def _transform_entity(self, entity: JSON) -> JSON:
         return entity
@@ -600,13 +617,29 @@ def _accumulate(self, aggregate: Aggregate, entity: JSON) -> None:
                 accumulator.accumulate(value)
 
     def _aggregate(self, aggregate: Aggregate) -> JSON:
+        if self.strict:
+            # Hot entity types depend on document_id being accumulated in full:
+            # the accumulator must exist and must not be allowed to overflow
+            ids = aggregate.get('document_id')
+            assert ids is not None, R('Hot entity types must always accumulate document_id',
+                                      self.entity_type, aggregate.keys())
+            assert not (isinstance(ids, SetAccumulator) and ids.allow_overflow), R(
+                'allow_overflow is not permitted when accumulating document_id '
+                'in hot entity types', self.entity_type
+            )
         result = {}
         for k, accumulator in aggregate.items():
             if accumulator is not None:
                 result[k] = accumulator.get()
                 if accumulator.dropped > 0:
-                    log.warning('Values were dropped %d times while aggregating %s.%s into %s',
-                                accumulator.dropped, self.entity_type, k, self.outer_entity_type)
+                    # Dropped values are fatal unless the accumulator was
+                    # explicitly configured to tolerate overflow
+                    message = (f'Values were dropped {accumulator.dropped} times while aggregating '
+                               f'{self.entity_type}.{k} into {self.outer_entity_type}')
+                    if isinstance(accumulator, SetAccumulator) and accumulator.allow_overflow:
+                        log.warning(message)
+                    else:
+                        assert False, R(message)
         return result
 
 

@@ -9,6 +9,7 @@
     Accumulator,
     DistinctAccumulator,
     GroupingAggregator,
+    SetAccumulator,
     SetOfDictAccumulator,
     SimpleAggregator,
     SumAccumulator,
@@ -22,14 +23,49 @@
 )
 
 
-class ActivityAggregator(SimpleAggregator):
-    pass
+class AnVILEntityAggregator(SimpleAggregator):
+
+    def _never_accumulate(self) -> set[str]:
+        """
+        The names of the identifier fields of the inner entity type. The
+        subclasses in this module exclude these fields from accumulation
+        in some or all aggregates. The entity-specific ID field is named
+        after the singular form of the entity type.
+        """
+        # Irregular plurals; any other entity type just loses its final "s"
+        irregular = {'activities': 'activity', 'diagnoses': 'diagnosis'}
+        singular = irregular.get(self.entity_type)
+        if singular is None:
+            assert self.entity_type.endswith('s'), self.entity_type
+            singular = self.entity_type[:-1]
+        return {singular + '_id', 'document_id', 'source_datarepo_row_ids'}
+
+
+class ActivityAggregator(AnVILEntityAggregator):
+
+    def _accumulator(self, field: str) -> Accumulator | None:
+        """
+        Exclude the fields named by `_never_accumulate` from every aggregate
+        except that of files, where compact and PFB manifests need them.
+        """
+        is_excluded = field in self._never_accumulate()
+        if is_excluded and self.outer_entity_type != 'files':
+            return None
+        else:
+            return super()._accumulator(field)
 
 
-class BiosampleAggregator(SimpleAggregator):
+class BiosampleAggregator(AnVILEntityAggregator):
 
     def _accumulator(self, field: str) -> Accumulator | None:
-        if field == 'donor_age_at_collection':
+        if (
+            field in self._never_accumulate()
+            and self.outer_entity_type != 'files'
+        ):
+            # These fields are only aggregated for files, where they are needed
+            # for compact and PFB manifests
+            return None
+        elif field == 'donor_age_at_collection':
             return SetOfDictAccumulator(max_size=100,
                                         key=compose_keys(none_safe_tuple_key(none_last=True),
                                                          itemgetter('lte', 'gte')))
@@ -38,25 +74,62 @@ def _accumulator(self, field: str) -> Accumulator | None:
 
 
 class DatasetAggregator(SimpleAggregator):
-    pass
+
+    def _accumulator(self, field: str) -> Accumulator | None:
+        if field == 'document_id':
+            # If any dataset IDs are missing from the aggregate, those datasets
+            # will be omitted during the verbatim handover. Datasets are a "hot"
+            # entity type, and we can't track their hubs in replica documents,
+            # so we rely on the inner entity IDs instead. We also need to
+            # aggregate document_id to allow filtering by the value on
+            # non-dataset endpoints.
+            return super()._accumulator(field)
+        elif field == 'source_datarepo_row_ids' and self.outer_entity_type != 'files':
+            # This field is only aggregated for files, where it is needed for
+            # compact and PFB manifests
+            return None
+        else:
+            return super()._accumulator(field)
 
 
-class DiagnosisAggregator(SimpleAggregator):
+class DiagnosisAggregator(AnVILEntityAggregator):
 
     def _accumulator(self, field: str) -> Accumulator | None:
-        if field in ('diagnosis_age', 'onset_age'):
+        if (
+            field in self._never_accumulate()
+            and self.outer_entity_type != 'files'
+        ):
+            # The identifier fields are only aggregated for files, where they
+            # are needed for compact and PFB manifests
+            return None
+        elif field in ('diagnosis_age', 'onset_age'):
             return SetOfDictAccumulator(max_size=100,
                                         key=compose_keys(none_safe_tuple_key(none_last=True),
                                                          itemgetter('lte', 'gte')))
+        elif field == 'disease':
+            return SetAccumulator(max_size=100,
+                                  # Some AnVIL datasets have excessive numbers of disease values,
+                                  # all being accessions, so overflow is allowed for datasets only.
+                                  allow_overflow=self.outer_entity_type == 'datasets')
         else:
             return super()._accumulator(field)
 
 
-class DonorAggregator(SimpleAggregator):
-    pass
+class DonorAggregator(AnVILEntityAggregator):
+
+    def _accumulator(self, field: str) -> Accumulator | None:
+        """
+        Exclude the fields named by `_never_accumulate` from every aggregate
+        except that of files, where compact and PFB manifests need them.
+        """
+        is_excluded = field in self._never_accumulate()
+        if is_excluded and self.outer_entity_type != 'files':
+            return None
+        else:
+            return super()._accumulator(field)
 
 
-class FileAggregator(GroupingAggregator):
+class FileAggregator(AnVILEntityAggregator, GroupingAggregator):
 
     def _transform_entity(self, entity: JSON) -> JSON:
         file_aggregate_fields = {
@@ -72,7 +145,14 @@ def _group_keys(self, entity) -> tuple[Any, ...]:
         return entity['file_format'],
 
     def _accumulator(self, field: str) -> Accumulator | None:
-        if field in ('count', 'file_size'):
+        if field in self._never_accumulate() | {
+            'drs_uri',
+            'file_md5sum',
+            'file_name',
+            'version'
+        }:
+            return None
+        elif field in ('count', 'file_size'):
             return DistinctAccumulator(SumAccumulator())
         else:
             return super()._accumulator(field)
@@ -181,7 +181,9 @@ def aggregator(cls, entity_type) -> EntityAggregator:
             agg_cls = FileAggregator
         else:
             assert False, entity_type
-        return agg_cls(cls.entity_type(), entity_type)
+        return agg_cls(cls.entity_type(), entity_type,
+                       strict=(issubclass(cls, ReplicaTransformer)
+                               and entity_type in cls.hot_entity_types().values()))
 
     def estimate(self, partition: BundlePartition) -> int:
         # Orphans are not considered when deciding whether to partition the

@@ -186,7 +186,7 @@ def exposed_indices(self) -> dict[EntityType, Sorting]:
             files=Sorting(field_name='fileName'),
             projects=Sorting(field_name='projectTitle',
                              max_page_size=75),
-            samples=Sorting(field_name='sampleId')
+            samples=Sorting(field_name='entryId')
         )
 
     @property
@@ -279,7 +279,6 @@ def _field_mapping(self) -> InverseFieldMapping:
                     'donor_count': 'donorCount'
                 },
                 'samples': {
-                    'biomaterial_id': 'sampleId',
                     'entity_type': 'sampleEntityType',
                     'organ': 'organ',
                     'organ_part': 'organPart',