-
Notifications
You must be signed in to change notification settings - Fork 4
Add support for HCA tissue atlas (#7128) #7877
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: develop
Are you sure you want to change the base?
Changes from all commits
1065984
68d8af0
a3f15bb
8b17022
aabb30f
8b5b645
5b5526f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -18,6 +18,9 @@ | |
| one, | ||
| ) | ||
|
|
||
| from azul.field_type import ( | ||
| Nested, | ||
| ) | ||
| from azul.lib import ( | ||
| cached_property, | ||
| ) | ||
|
|
@@ -558,9 +561,11 @@ def file_type_summary(aggregate_file: JSON) -> FileTypeSummaryForHit: | |
| ] | ||
| return summarized_hit | ||
|
|
||
| def make_terms(self, agg) -> Terms: | ||
| def choose_entry(_term): | ||
| if 'key_as_string' in _term: | ||
| def make_terms(self, field_type, agg) -> Terms: | ||
| def choose_entry(_term, nested_keys): | ||
| if nested_keys is not None: | ||
| return dict(zip(nested_keys, _term['key'])) | ||
| elif 'key_as_string' in _term: | ||
| return _term['key_as_string'] | ||
| elif (term_key := _term['key']) is None: | ||
| return None | ||
|
|
@@ -571,10 +576,15 @@ def choose_entry(_term): | |
| else: | ||
| return str(term_key) | ||
|
|
||
| if isinstance(field_type, Nested): | ||
| nested_keys = [path[-1] for path in agg['myTerms']['meta']['paths']] | ||
| else: | ||
| nested_keys = None | ||
| terms: list[Term] = [] | ||
| for bucket in agg['myTerms']['buckets']: | ||
| term = Term(term=choose_entry(bucket), | ||
| count=bucket['doc_count']) | ||
| doc_count = bucket['doc_count'] | ||
| term = Term(term=choose_entry(bucket, nested_keys), | ||
| count=doc_count) | ||
| try: | ||
| sub_agg = bucket['myProjectIds'] | ||
| except KeyError: | ||
|
|
@@ -605,8 +615,9 @@ def choose_entry(_term): | |
| type='terms') | ||
|
|
||
| def make_facets(self, aggs: JSON) -> dict[str, Terms]: | ||
| field_types = self.service.field_types_by_name(self.catalog) | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Please familiarize yourself with what the "Pull up method" refactoring does. This change is not that. |
||
| facets = {} | ||
| for facet, agg in aggs.items(): | ||
| if facet != '_project_agg': # Filter out project specific aggs | ||
| facets[facet] = self.make_terms(agg) | ||
| facets[facet] = self.make_terms(field_types[facet], agg) | ||
| return facets | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -35,6 +35,7 @@ | |
| ) | ||
| from opensearchpy.helpers.aggs import ( | ||
| Agg, | ||
| MultiTerms, | ||
| Terms, | ||
| ) | ||
| from opensearchpy.helpers.query import ( | ||
|
|
@@ -327,21 +328,35 @@ def _prepare_aggregation(self, *, facet: str, facet_path: FieldPath) -> Agg: | |
|
|
||
| field_type = self.service.field_type(self.catalog, facet_path) | ||
| if isinstance(field_type, Nested): | ||
| nested_agg = agg.bucket(name='nested', | ||
| agg_type='nested', | ||
| path=dotted(facet_path)) | ||
| facet_path = dotted(facet_path, field_type.agg_property) | ||
| path = dotted(facet_path) | ||
| # A nested aggregation to aggregate on fields inside a nested field | ||
| agg.bucket(name='nested', | ||
| agg_type='nested', | ||
| path=path) | ||
| # A multi-terms aggregation to form composite keys made from the | ||
| # fields inside a nested field | ||
| agg.aggs.nested.bucket(name='myTerms', | ||
| agg_type='multi_terms', | ||
| terms=[ | ||
| {'field': path + f'.{field}.keyword'} | ||
| for field in field_type.properties | ||
| ], | ||
| size=config.terms_aggregation_size) | ||
| # A filter aggregation to work around that we can't use a missing | ||
| # aggregation with a nested field. | ||
| # See https://github.com/elastic/elasticsearch/issues/9571 | ||
| agg.bucket(name='untagged', | ||
| agg_type='filter', | ||
| filter=Q('bool', must_not=[ | ||
| Q('nested', path=path, query=Q('exists', field=path)) | ||
| ])) | ||
| else: | ||
| nested_agg = agg | ||
| # Make an inner agg that will contain the terms in question | ||
| path = dotted(facet_path, 'keyword') | ||
| # FIXME: Approximation errors for terms aggregation are unchecked | ||
| # https://github.com/DataBiosphere/azul/issues/3413 | ||
| nested_agg.bucket(name='myTerms', | ||
| agg_type='terms', | ||
| field=path, | ||
| size=config.terms_aggregation_size) | ||
| nested_agg.bucket('untagged', 'missing', field=path) | ||
| path = dotted(facet_path, 'keyword') | ||
| agg.bucket(name='myTerms', | ||
| agg_type='terms', | ||
| field=path, | ||
| size=config.terms_aggregation_size) | ||
| agg.bucket('untagged', 'missing', field=path) | ||
| return agg | ||
|
|
||
| def _annotate_aggs_for_translation(self, request: Search): | ||
|
|
@@ -352,13 +367,20 @@ def _annotate_aggs_for_translation(self, request: Search): | |
| """ | ||
|
|
||
| def annotate(agg: Agg): | ||
| if isinstance(agg, Terms): | ||
| path = agg.field.split('.') | ||
| if path[-1] == 'keyword': | ||
| path.pop() | ||
| if isinstance(agg, (Terms, MultiTerms)): | ||
| if not hasattr(agg, 'meta'): | ||
| agg.meta = {} | ||
| agg.meta['path'] = path | ||
| if hasattr(agg, 'terms'): | ||
| # A MultiTerms agg contains multiple fields, and we need the | ||
| # path of each one. We store the paths in the same order the | ||
| # fields occur in the `terms` list. | ||
| agg.meta['paths'] = [] | ||
| for term in agg.terms: | ||
| path = term['field'].removesuffix('.keyword').split('.') | ||
| agg.meta['paths'].append(path) | ||
| else: | ||
| path = agg.field.removesuffix('.keyword').split('.') | ||
| agg.meta['path'] = path | ||
| if hasattr(agg, 'aggs'): | ||
| subs = agg.aggs | ||
| for sub_name in subs: | ||
|
|
@@ -391,6 +413,8 @@ def translate(k, v: MutableJSON): | |
| translate(k, v) | ||
| else: | ||
| try: | ||
| # We annotated Terms aggregates with `path`, a dotted path | ||
| # to a field in an index document | ||
| path = v['meta']['path'] | ||
| except KeyError: | ||
| pass | ||
|
|
@@ -399,6 +423,19 @@ def translate(k, v: MutableJSON): | |
| for bucket in buckets: | ||
| bucket['key'] = field_type.from_index(bucket['key']) | ||
| translate(k, bucket) | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. PL to explain why
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I was mistaken in thinking that this is no longer needed. We still have occurrences of nested agg buckets such as in the HCA agg
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Please don't ignore my PL request.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Decided in PL to refactor the changes to the translation of aggregates ( |
||
| try: | ||
| # We annotated MultiTerms aggregates with `paths`, a list of | ||
| # dotted paths for the fields inside a nested field | ||
| paths = v['meta']['paths'] | ||
| except KeyError: | ||
| pass | ||
| else: | ||
| for i, path in enumerate(paths): | ||
| field_type = self.service.field_type(self.catalog, tuple(path)) | ||
| for bucket in buckets: | ||
| bucket['key'][i] = field_type.from_index(bucket['key'][i]) | ||
| for bucket in buckets: | ||
| translate(k, bucket) | ||
|
|
||
| for k, v in aggs.items(): | ||
| translate(k, v) | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This comment was generated by Claude Code.
With these two overrides, does it still make sense for
Nestedto inheritPassThrough?The defining characteristic of
PassThroughis thatto_indexandfrom_indexare identity operations. After this change,Nestedoverrides both with non-trivial logic that delegates to per-property field types — the values are no longer "passed through" unchanged.What
Nestedstill gets fromPassThrough:es_typeproperty (trivial — stores and returns_es_type)allow_sorting_by_empty_lists = False__init__convenience of setting bothnative_formandindex_formto the same type (JSON)All three are easy to replicate by inheriting
FieldType[JSON, JSON]directly.