From abea9570fccbe27b5cbc021c5264b14e999084a7 Mon Sep 17 00:00:00 2001 From: Noa Dove Date: Tue, 20 Jan 2026 17:07:22 -0800 Subject: [PATCH 01/15] Fix conflation of absent/null properties in mirror documentation (#7637) --- docs/mirror.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/mirror.rst b/docs/mirror.rst index 93236369f4..b1d3c51629 100644 --- a/docs/mirror.rst +++ b/docs/mirror.rst @@ -160,9 +160,9 @@ response from that endpoint contains the file's mirror URI at ``hits[].files[].azul_mirror_uri``. The mirror URI is of the form ``s3://${bucket}/file/${digest_value}.${digest_type}`` where ``digest_type`` and ``digest_value`` denote the primary digest of the file. If the property -``azul_mirror_uri`` is absent from the Azul response, the mirror will not -include that file. If the response property is present, the mirror will very -likely include the file. +``azul_mirror_uri`` is null in the Azul response, the mirror will not include +that file. If the response property is non-null, the mirror will very likely +include the file. .. [8] https://service.azul.data.humancellatlas.org/ .. [9] https://service.explore.anvilproject.org/ From 552037ffb8277f3f1d523d8fa24a3df125402dfe Mon Sep 17 00:00:00 2001 From: Noa Dove Date: Wed, 21 Jan 2026 17:32:04 -0800 Subject: [PATCH 02/15] Rename source service test --- test/service/{test_source_cache.py => test_source_service.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename test/service/{test_source_cache.py => test_source_service.py} (100%) diff --git a/test/service/test_source_cache.py b/test/service/test_source_service.py similarity index 100% rename from test/service/test_source_cache.py rename to test/service/test_source_service.py From 65d81966337227a57be1d959f39723bb2ac0231d Mon Sep 17 00:00:00 2001 From: Noa Dove Date: Wed, 21 Jan 2026 12:13:51 -0800 Subject: [PATCH 03/15] Reorganize static resources --- lambdas/indexer/vendor/resources/static/schemas | 2 +- lambdas/indexer/vendor/resources/static/swagger | 2 +- lambdas/service/vendor/resources/static/swagger | 2 +- {schemas => resources/static/schemas}/mirror/info/v1.json | 0 {schemas => resources/static/schemas}/mirror/info/v2.json | 0 {swagger => resources/static/swagger}/index.css | 0 {swagger => resources/static/swagger}/index.html | 0 {swagger => resources/static/swagger}/oauth2-redirect.html | 0 {swagger => resources/static/swagger}/oauth2-redirect.js | 0 {swagger => resources/static/swagger}/swagger-initializer.js | 0 .../static/swagger}/swagger-initializer.js.template.mustache | 0 {swagger => resources/static/swagger}/swagger-ui-bundle.js | 0 .../static/swagger}/swagger-ui-standalone-preset.js | 0 {swagger => resources/static/swagger}/swagger-ui.css | 0 scripts/update_swagger.py | 2 +- 15 files changed, 4 insertions(+), 4 deletions(-) rename {schemas => resources/static/schemas}/mirror/info/v1.json (100%) rename {schemas => resources/static/schemas}/mirror/info/v2.json (100%) rename {swagger => resources/static/swagger}/index.css (100%) rename {swagger => resources/static/swagger}/index.html (100%) rename {swagger => resources/static/swagger}/oauth2-redirect.html (100%) rename {swagger => resources/static/swagger}/oauth2-redirect.js (100%) rename {swagger => resources/static/swagger}/swagger-initializer.js (100%) rename {swagger => resources/static/swagger}/swagger-initializer.js.template.mustache (100%) rename {swagger => resources/static/swagger}/swagger-ui-bundle.js (100%) rename {swagger => resources/static/swagger}/swagger-ui-standalone-preset.js (100%) rename {swagger => resources/static/swagger}/swagger-ui.css (100%) diff --git a/lambdas/indexer/vendor/resources/static/schemas b/lambdas/indexer/vendor/resources/static/schemas index fd8289d0d9..b91abae08d 120000 --- a/lambdas/indexer/vendor/resources/static/schemas +++ b/lambdas/indexer/vendor/resources/static/schemas @@ -1 +1 @@ -../../../../../schemas \ No newline at end of file +../../../../../resources/static/schemas/ \ No newline at end of file diff --git a/lambdas/indexer/vendor/resources/static/swagger b/lambdas/indexer/vendor/resources/static/swagger index 7c782ec5ff..c30f07fa0d 120000 --- a/lambdas/indexer/vendor/resources/static/swagger +++ b/lambdas/indexer/vendor/resources/static/swagger @@ -1 +1 @@ -../../../../../swagger/ \ No newline at end of file +../../../../../resources/static/swagger/ \ No newline at end of file diff --git a/lambdas/service/vendor/resources/static/swagger b/lambdas/service/vendor/resources/static/swagger index 7c782ec5ff..c30f07fa0d 120000 --- a/lambdas/service/vendor/resources/static/swagger +++ b/lambdas/service/vendor/resources/static/swagger @@ -1 +1 @@ -../../../../../swagger/ \ No newline at end of file +../../../../../resources/static/swagger/ \ No newline at end of file diff --git a/schemas/mirror/info/v1.json b/resources/static/schemas/mirror/info/v1.json similarity index 100% rename from schemas/mirror/info/v1.json rename to resources/static/schemas/mirror/info/v1.json diff --git a/schemas/mirror/info/v2.json b/resources/static/schemas/mirror/info/v2.json similarity index 100% rename from schemas/mirror/info/v2.json rename to resources/static/schemas/mirror/info/v2.json diff --git a/swagger/index.css b/resources/static/swagger/index.css similarity index 100% rename from swagger/index.css rename to resources/static/swagger/index.css diff --git a/swagger/index.html b/resources/static/swagger/index.html similarity index 100% rename from swagger/index.html rename to resources/static/swagger/index.html diff --git a/swagger/oauth2-redirect.html b/resources/static/swagger/oauth2-redirect.html similarity index 100% rename from swagger/oauth2-redirect.html rename to resources/static/swagger/oauth2-redirect.html diff --git a/swagger/oauth2-redirect.js b/resources/static/swagger/oauth2-redirect.js similarity index 100% rename from swagger/oauth2-redirect.js rename to resources/static/swagger/oauth2-redirect.js diff --git a/swagger/swagger-initializer.js b/resources/static/swagger/swagger-initializer.js similarity index 100% rename from swagger/swagger-initializer.js rename to resources/static/swagger/swagger-initializer.js diff --git a/swagger/swagger-initializer.js.template.mustache b/resources/static/swagger/swagger-initializer.js.template.mustache similarity index 100% rename from swagger/swagger-initializer.js.template.mustache rename to resources/static/swagger/swagger-initializer.js.template.mustache diff --git a/swagger/swagger-ui-bundle.js b/resources/static/swagger/swagger-ui-bundle.js similarity index 100% rename from swagger/swagger-ui-bundle.js rename to resources/static/swagger/swagger-ui-bundle.js diff --git a/swagger/swagger-ui-standalone-preset.js b/resources/static/swagger/swagger-ui-standalone-preset.js similarity index 100% rename from swagger/swagger-ui-standalone-preset.js rename to resources/static/swagger/swagger-ui-standalone-preset.js diff --git a/swagger/swagger-ui.css b/resources/static/swagger/swagger-ui.css similarity index 100% rename from swagger/swagger-ui.css rename to resources/static/swagger/swagger-ui.css diff --git a/scripts/update_swagger.py b/scripts/update_swagger.py index dbb5dfbd56..f183c8dd42 100644 --- a/scripts/update_swagger.py +++ b/scripts/update_swagger.py @@ -36,7 +36,7 @@ 'swagger-initializer.js' ] -swagger_dir = Path(config.project_root) / 'swagger' +swagger_dir = Path(config.project_root) / 'resources/static/swagger' def download_file(name: str): From e8315b344934cc5a57eae9d08b0466ee45686792 Mon Sep 17 00:00:00 2001 From: Noa Dove Date: Wed, 21 Jan 2026 12:17:05 -0800 Subject: [PATCH 04/15] Deduplicate outsourced environ template file --- .../indexer/vendor/resources/environ.json.template.py | 9 +-------- .../service/vendor/resources/environ.json.template.py | 9 +-------- resources/environ.json.template.py | 8 ++++++++ 3 files changed, 10 insertions(+), 16 deletions(-) mode change 100644 => 120000 lambdas/indexer/vendor/resources/environ.json.template.py mode change 100644 => 120000 lambdas/service/vendor/resources/environ.json.template.py create mode 100644 resources/environ.json.template.py diff --git a/lambdas/indexer/vendor/resources/environ.json.template.py b/lambdas/indexer/vendor/resources/environ.json.template.py deleted file mode 100644 index f77298a3aa..0000000000 --- a/lambdas/indexer/vendor/resources/environ.json.template.py +++ /dev/null @@ -1,8 +0,0 @@ -from azul import ( - config, -) -from azul.template import ( - emit, -) - -emit(config.lambda_env_for_outsourcing) diff --git a/lambdas/indexer/vendor/resources/environ.json.template.py b/lambdas/indexer/vendor/resources/environ.json.template.py new file mode 120000 index 0000000000..aa5222b10e --- /dev/null +++ b/lambdas/indexer/vendor/resources/environ.json.template.py @@ -0,0 +1 @@ +../../../../resources/environ.json.template.py \ No newline at end of file diff --git a/lambdas/service/vendor/resources/environ.json.template.py b/lambdas/service/vendor/resources/environ.json.template.py deleted file mode 100644 index f77298a3aa..0000000000 --- a/lambdas/service/vendor/resources/environ.json.template.py +++ /dev/null @@ -1,8 +0,0 @@ -from azul import ( - config, -) -from azul.template import ( - emit, -) - -emit(config.lambda_env_for_outsourcing) diff --git a/lambdas/service/vendor/resources/environ.json.template.py b/lambdas/service/vendor/resources/environ.json.template.py new file mode 120000 index 0000000000..aa5222b10e --- /dev/null +++ b/lambdas/service/vendor/resources/environ.json.template.py @@ -0,0 +1 @@ +../../../../resources/environ.json.template.py \ No newline at end of file diff --git a/resources/environ.json.template.py b/resources/environ.json.template.py new file mode 100644 index 0000000000..f77298a3aa --- /dev/null +++ b/resources/environ.json.template.py @@ -0,0 +1,8 @@ +from azul import ( + config, +) +from azul.template import ( + emit, +) + +emit(config.lambda_env_for_outsourcing) From 9ddb342d75b761556f37299c75a96bac43dd4501 Mon Sep 17 00:00:00 2001 From: Noa Dove Date: Wed, 21 Jan 2026 17:31:18 -0800 Subject: [PATCH 05/15] [R] Cover resource templates with pep8 --- Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/Makefile b/Makefile index 122e18ffee..4368384205 100644 --- a/Makefile +++ b/Makefile @@ -207,6 +207,7 @@ absolute_sources = $(shell echo $(project_root)/src \ $$(find $(project_root)/terraform{,/gitlab,/shared,/browser} \ $(project_root)/lambdas/{indexer,service}{,/.chalice} \ $(project_root)/.github \ + $(project_root)/resources \ -maxdepth 1 \ -name '*.template.py' \ -type f )) From 76c02e44d96cc9a098cf0f2522132af7c866aa14 Mon Sep 17 00:00:00 2001 From: Noa Dove Date: Mon, 26 Jan 2026 15:56:00 -0800 Subject: [PATCH 06/15] Inline method --- src/azul/indexer/mirror_service.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/azul/indexer/mirror_service.py b/src/azul/indexer/mirror_service.py index 96a26497ba..59f293aeb3 100644 --- a/src/azul/indexer/mirror_service.py +++ b/src/azul/indexer/mirror_service.py @@ -547,7 +547,9 @@ def _mirror(self, a: MirrorAction): @_mirror.register def _(self, a: MirrorSourceAction) -> Iterator[MirrorAction]: - assert a.source.id in self._list_public_source_ids(), R( + public_sources = self._source_service.list_source_ids(self.catalog, + authentication=None) + assert a.source.id in public_sources, R( 'Cannot mirror non-public source', a.source) plugin = self._repository_plugin # The desired partition size depends on the maximum number of messages @@ -572,9 +574,6 @@ def _(self, a: MirrorSourceAction) -> Iterator[MirrorAction]: for partition in prefix.partition_prefixes(): yield devolve(MirrorPartitionAction, a, source=source, prefix=partition) - def _list_public_source_ids(self) -> set[str]: - return self._source_service.list_source_ids(self.catalog, authentication=None) - @_mirror.register def _(self, a: MirrorPartitionAction) -> Iterator[MirrorAction]: plugin = self._repository_plugin From 35e7cf24588204b20cced0a0505144a814539a08 Mon Sep 17 00:00:00 2001 From: Noa Dove Date: Wed, 28 Jan 2026 14:31:08 -0800 Subject: [PATCH 07/15] Rename list_source[_id]s to list_accessible_source[_id]s --- scripts/mirror.py | 2 +- src/azul/indexer/mirror_service.py | 4 ++-- src/azul/plugins/__init__.py | 16 +++++++-------- .../plugins/repository/canned/__init__.py | 6 +++--- src/azul/plugins/repository/dss/__init__.py | 6 +++--- src/azul/plugins/repository/tdr.py | 12 +++++------ src/azul/service/source_controller.py | 5 +++-- src/azul/service/source_service.py | 20 +++++++++---------- test/indexer/test_mirror_controller.py | 2 +- test/service/test_response.py | 3 ++- 10 files changed, 39 insertions(+), 37 deletions(-) diff --git a/scripts/mirror.py b/scripts/mirror.py index ec76508c16..ff66008b1e 100644 --- a/scripts/mirror.py +++ b/scripts/mirror.py @@ -37,7 +37,7 @@ def mirror_catalog(azul: AzulClient, fail_queue) public_sources_by_spec = { source.spec: source - for source in plugin.list_sources(authentication=None) + for source in plugin.list_accessible_sources(authentication=None) } # When the user doesn't specify a source or provides "*" as a source glob, # we implicitly filter out managed-access sources. This lets us assert that diff --git a/src/azul/indexer/mirror_service.py b/src/azul/indexer/mirror_service.py index 59f293aeb3..2ca6294740 100644 --- a/src/azul/indexer/mirror_service.py +++ b/src/azul/indexer/mirror_service.py @@ -547,8 +547,8 @@ def _mirror(self, a: MirrorAction): @_mirror.register def _(self, a: MirrorSourceAction) -> Iterator[MirrorAction]: - public_sources = self._source_service.list_source_ids(self.catalog, - authentication=None) + public_sources = self._source_service.list_accessible_source_ids(self.catalog, + authentication=None) assert a.source.id in public_sources, R( 'Cannot mirror non-public source', a.source) plugin = self._repository_plugin diff --git a/src/azul/plugins/__init__.py b/src/azul/plugins/__init__.py index c68393f697..837ccf158a 100644 --- a/src/azul/plugins/__init__.py +++ b/src/azul/plugins/__init__.py @@ -654,9 +654,9 @@ def _assert_partition(self, source: SOURCE_REF, prefix: str): assert prefix in source.prefix, (source, prefix) @abstractmethod - def list_sources(self, - authentication: Authentication | None - ) -> Iterable[SOURCE_REF]: + def list_accessible_sources(self, + authentication: Authentication | None + ) -> Iterable[SOURCE_REF]: """ The sources the plugin is configured to read metadata from that are accessible using the provided authentication. Retrieving this @@ -666,20 +666,20 @@ def list_sources(self, """ raise NotImplementedError - def list_source_ids(self, - authentication: Authentication | None - ) -> set[str]: + def list_accessible_source_ids(self, + authentication: Authentication | None + ) -> set[str]: """ List source IDs in the underlying repository that are accessible using the provided authentication. Sources may be included even if they are not configured to be read from. Subclasses should override this method - if it can be implemented more efficiently than `list_sources`. + if it can be implemented more efficiently than `list_accessible_sources`. Retrieving this information may require a round-trip to the underlying repository. Implementations should raise PermissionError if the provided authentication is insufficient to access the repository. """ - return {source.id for source in self.list_sources(authentication)} + return {source.id for source in self.list_accessible_sources(authentication)} @cached_property def _generic_params(self) -> dict[TypeVar, type]: diff --git a/src/azul/plugins/repository/canned/__init__.py b/src/azul/plugins/repository/canned/__init__.py index 1db67a6947..a1bc91bebc 100644 --- a/src/azul/plugins/repository/canned/__init__.py +++ b/src/azul/plugins/repository/canned/__init__.py @@ -85,9 +85,9 @@ class Plugin(RepositoryPlugin[ ], HasCachedHttpClient): - def list_sources(self, - authentication: Authentication | None - ) -> list[CannedSourceRef]: + def list_accessible_sources(self, + authentication: Authentication | None + ) -> list[CannedSourceRef]: return [ CannedSourceRef(id=self._lookup_source_id(spec), spec=spec, prefix=None) for spec in self.sources diff --git a/src/azul/plugins/repository/dss/__init__.py b/src/azul/plugins/repository/dss/__init__.py index fc9d7b924d..5bfcda60b2 100644 --- a/src/azul/plugins/repository/dss/__init__.py +++ b/src/azul/plugins/repository/dss/__init__.py @@ -114,9 +114,9 @@ def count_bundles(self, source: DSSSourceRef) -> NoReturn: def count_files(self, source: DSSSourceRef) -> NoReturn: assert False, 'DSS is EOL' - def list_sources(self, - authentication: Authentication | None - ) -> list[DSSSourceRef]: + def list_accessible_sources(self, + authentication: Authentication | None + ) -> list[DSSSourceRef]: return [ DSSSourceRef(id=self._lookup_source_id(spec), spec=spec, prefix=None) for spec in self.sources diff --git a/src/azul/plugins/repository/tdr.py b/src/azul/plugins/repository/tdr.py index 4593dc3d20..6cd9242eb2 100644 --- a/src/azul/plugins/repository/tdr.py +++ b/src/azul/plugins/repository/tdr.py @@ -112,9 +112,9 @@ def _auth_fallback(self, tdr = self._user_authenticated_tdr(None) return tdr_callback(tdr) - def list_sources(self, - authentication: Authentication | None - ) -> list[TDRSourceRef]: + def list_accessible_sources(self, + authentication: Authentication | None + ) -> list[TDRSourceRef]: configured_specs_by_name = {spec.name: spec for spec in self.sources} # Filter by prefix of snapshot names in an attempt to speed up the # listing by limiting the number of irrelevant snapshots returned. Note @@ -136,9 +136,9 @@ def list_sources(self, for name, id in snapshot_ids_by_name.items() ] - def list_source_ids(self, - authentication: Authentication | None - ) -> set[str]: + def list_accessible_source_ids(self, + authentication: Authentication | None + ) -> set[str]: return self._auth_fallback(authentication, lambda tdr: tdr.snapshot_ids()) diff --git a/src/azul/service/source_controller.py b/src/azul/service/source_controller.py index 2865aff731..a63eb2030e 100644 --- a/src/azul/service/source_controller.py +++ b/src/azul/service/source_controller.py @@ -42,7 +42,7 @@ def list_sources(self, authentication: Authentication | None ) -> JSONs: try: - sources = self._source_service.list_sources(catalog, authentication) + sources = self._source_service.list_accessible_sources(catalog, authentication) except PermissionError: raise UnauthorizedError except LimitedTimeoutException as e: @@ -70,7 +70,8 @@ def _list_source_ids(self, authentication: Authentication | None ) -> set[str]: try: - source_ids = self._source_service.list_source_ids(catalog, authentication) + source_ids = self._source_service.list_accessible_source_ids(catalog, + authentication) except PermissionError: raise UnauthorizedError except LimitedTimeoutException as e: diff --git a/src/azul/service/source_service.py b/src/azul/service/source_service.py index 1bb532ea64..6d4d31fd77 100644 --- a/src/azul/service/source_service.py +++ b/src/azul/service/source_service.py @@ -53,10 +53,10 @@ class SourceService: def _repository_plugin(self, catalog: CatalogName) -> RepositoryPlugin: return RepositoryPlugin.load(catalog).create(catalog) - def list_source_ids(self, - catalog: CatalogName, - authentication: Authentication | None - ) -> set[str]: + def list_accessible_source_ids(self, + catalog: CatalogName, + authentication: Authentication | None + ) -> set[str]: plugin = self._repository_plugin(catalog) cache_key = ( @@ -69,15 +69,15 @@ def list_source_ids(self, try: source_ids = set(self._get(cache_key)) except CacheMiss: - source_ids = plugin.list_source_ids(authentication) + source_ids = plugin.list_accessible_source_ids(authentication) self._put(cache_key, list(source_ids)) return source_ids - def list_sources(self, - catalog: CatalogName, - authentication: Authentication | None - ) -> Iterable[SourceRef]: - return self._repository_plugin(catalog).list_sources(authentication) + def list_accessible_sources(self, + catalog: CatalogName, + authentication: Authentication | None + ) -> Iterable[SourceRef]: + return self._repository_plugin(catalog).list_accessible_sources(authentication) table_name = config.dynamo_sources_cache_table_name diff --git a/test/indexer/test_mirror_controller.py b/test/indexer/test_mirror_controller.py index bae3920ef2..cb4c637706 100644 --- a/test/indexer/test_mirror_controller.py +++ b/test/indexer/test_mirror_controller.py @@ -84,7 +84,7 @@ def app_name(cls) -> str: def setUpClass(cls): super().setUpClass() cls.addClassPatch(patch.object(SourceService, - 'list_source_ids', + 'list_accessible_source_ids', return_value={cls.source.id})) cls.addClassPatch(patch.object(MirrorAction, '_operation_id', diff --git a/test/service/test_response.py b/test/service/test_response.py index 0b88b40429..a15058160a 100644 --- a/test/service/test_response.py +++ b/test/service/test_response.py @@ -2186,7 +2186,8 @@ def _test(entity_type: str, expect_empty: bool, expect_accessible: bool): for entity_type in filtered_entity_types: _test(entity_type, expect_empty=False, expect_accessible=True) - with mock.patch('azul.plugins.repository.dss.Plugin.list_sources', return_value=[]): + with mock.patch('azul.plugins.repository.dss.Plugin.list_accessible_sources', + return_value=[]): for entity_type, is_filtered in filtered_entity_types.items(): _test(entity_type, expect_empty=is_filtered, expect_accessible=False) From 60f7b84c4af99867207c2e1583e93cf1990c7dd9 Mon Sep 17 00:00:00 2001 From: Noa Dove Date: Wed, 28 Jan 2026 16:55:41 -0800 Subject: [PATCH 08/15] Pull up {Simple,TDR}SourceSpec.name into superclass --- src/azul/indexer/__init__.py | 2 +- src/azul/terra.py | 7 +++---- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/src/azul/indexer/__init__.py b/src/azul/indexer/__init__.py index 28af294e13..2b39813eed 100644 --- a/src/azul/indexer/__init__.py +++ b/src/azul/indexer/__init__.py @@ -423,6 +423,7 @@ class SourceSpec(Parseable, metaclass=ABCMeta): are structured might want to implement this abstract class. Plugins that have simple unstructured names may want to use :class:`SimpleSourceSpec`. """ + name: str @attrs.frozen(kw_only=True) @@ -430,7 +431,6 @@ class SimpleSourceSpec(SourceSpec): """ Default implementation for unstructured source names. """ - name: str @classmethod def parse(cls, spec: str) -> Self: diff --git a/src/azul/terra.py b/src/azul/terra.py index 609b4617f5..6fd875c600 100644 --- a/src/azul/terra.py +++ b/src/azul/terra.py @@ -123,7 +123,6 @@ class Domain(StrEnum): type: Type domain: Domain subdomain: str - name: str @classmethod def parse(cls, spec: str) -> Self: @@ -133,10 +132,10 @@ def parse(cls, spec: str) -> Self: >>> s = TDRSourceSpec.parse('tdr:bigquery:gcp:foo:bar') >>> s # doctest: +NORMALIZE_WHITESPACE - TDRSourceSpec(type=, + TDRSourceSpec(name='bar', + type=, domain=, - subdomain='foo', - name='bar') + subdomain='foo') >>> str(s) 'tdr:bigquery:gcp:foo:bar' From 5eef9164dcbc1cd2f4d6d6e980aea960f638318f Mon Sep 17 00:00:00 2001 From: Noa Dove Date: Thu, 29 Jan 2026 15:38:14 -0800 Subject: [PATCH 09/15] Document uniqueness assumption of source names --- src/azul/indexer/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/azul/indexer/__init__.py b/src/azul/indexer/__init__.py index 2b39813eed..b9d63ccbdd 100644 --- a/src/azul/indexer/__init__.py +++ b/src/azul/indexer/__init__.py @@ -423,6 +423,7 @@ class SourceSpec(Parseable, metaclass=ABCMeta): are structured might want to implement this abstract class. Plugins that have simple unstructured names may want to use :class:`SimpleSourceSpec`. """ + #: Assumed to be unique per catalog. name: str From 0c5ba41f45f739f32665b26a1cca422c47eb1bca Mon Sep 17 00:00:00 2001 From: Noa Dove Date: Mon, 12 Jan 2026 15:45:18 -0800 Subject: [PATCH 10/15] Extract protected methods from list_accessible_sources in TDR plugin --- src/azul/indexer/__init__.py | 4 +++- src/azul/plugins/__init__.py | 21 ++++++++++++++++++++ src/azul/plugins/repository/tdr.py | 31 ++++++++++++------------------ 3 files changed, 36 insertions(+), 20 deletions(-) diff --git a/src/azul/indexer/__init__.py b/src/azul/indexer/__init__.py index b9d63ccbdd..dc07d7cac0 100644 --- a/src/azul/indexer/__init__.py +++ b/src/azul/indexer/__init__.py @@ -466,7 +466,9 @@ class SourceRef[SOURCE_SPEC: SourceSpec]( Note to plugin implementers: Since the source ID can't be assumed to be globally unique, plugins should subclass this class, even if the subclass - body is empty. + body is empty. Additionally, subclasses must not add any fields that are + required by the constructor, since the base repository plugin needs to be + able to instantiate them generically. >>> spec = SimpleSourceSpec(name='') >>> prefix = Prefix(partition=0) diff --git a/src/azul/plugins/__init__.py b/src/azul/plugins/__init__.py index 837ccf158a..d344183c9e 100644 --- a/src/azul/plugins/__init__.py +++ b/src/azul/plugins/__init__.py @@ -653,6 +653,27 @@ def _assert_partition(self, source: SOURCE_REF, prefix: str): assert source.prefix is not None, source assert prefix in source.prefix, (source, prefix) + def _match_sources(self, + source_names_by_id: Mapping[str, str] + ) -> list[SOURCE_REF]: + """ + Filter the given sources to only include sources that the plugin is + configured to read metadata from, and instantiate them as `SourceRef`s. + """ + configured_specs_by_name = {spec.name: spec for spec in self.sources} + source_ids_by_name = { + name: id + for id, name in source_names_by_id.items() + if name in configured_specs_by_name + } + source_ref_cls = self.source_ref_cls + return [ + source_ref_cls(id=id, + spec=configured_specs_by_name[name], + prefix=None) + for name, id in source_ids_by_name.items() + ] + @abstractmethod def list_accessible_sources(self, authentication: Authentication | None diff --git a/src/azul/plugins/repository/tdr.py b/src/azul/plugins/repository/tdr.py index 6cd9242eb2..5c75eca18c 100644 --- a/src/azul/plugins/repository/tdr.py +++ b/src/azul/plugins/repository/tdr.py @@ -23,6 +23,7 @@ from azul import ( cache_per_thread, + cached_property, config, require, ) @@ -112,29 +113,21 @@ def _auth_fallback(self, tdr = self._user_authenticated_tdr(None) return tdr_callback(tdr) - def list_accessible_sources(self, - authentication: Authentication | None - ) -> list[TDRSourceRef]: - configured_specs_by_name = {spec.name: spec for spec in self.sources} - # Filter by prefix of snapshot names in an attempt to speed up the + @cached_property + def _common_source_filter(self) -> str: + # We filter by prefix of snapshot names in an attempt to speed up the # listing by limiting the number of irrelevant snapshots returned. Note # that TDR does a substring match, not a prefix match, but determining # the longest common substring is complicated and, as of yet, I haven't # found a trustworthy, reusable implementation. - filter = longest_common_prefix(configured_specs_by_name.keys()) - snapshots = self._auth_fallback(authentication, - lambda tdr: tdr.snapshot_names_by_id(filter=filter)) - snapshot_ids_by_name = { - name: id - for id, name in snapshots.items() - if name in configured_specs_by_name - } - return [ - TDRSourceRef(id=id, - spec=configured_specs_by_name[name], - prefix=None) - for name, id in snapshot_ids_by_name.items() - ] + return longest_common_prefix(spec.name for spec in self.sources) + + def list_accessible_sources(self, + authentication: Authentication | None + ) -> list[TDRSourceRef]: + names_by_id = self._auth_fallback(authentication, + lambda tdr: tdr.snapshot_names_by_id(filter=self._common_source_filter)) + return self._match_sources(names_by_id) def list_accessible_source_ids(self, authentication: Authentication | None From b42f587fd58719b3b5b011df01e8fa5ab70d60a0 Mon Sep 17 00:00:00 2001 From: Noa Dove Date: Thu, 15 Jan 2026 17:19:48 -0800 Subject: [PATCH 11/15] Extract variable --- test/integration_test.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/test/integration_test.py b/test/integration_test.py index 54d2c164cd..9495c10438 100644 --- a/test/integration_test.py +++ b/test/integration_test.py @@ -1583,9 +1583,10 @@ def _test_managed_access_repository_files(self, 'is': [ma_source.id] } }) + inner_files = [one(file['files']) for file in files] managed_access_file_urls = { - one(file['files'])['azul_url'] - for file in files + file['azul_url'] + for file in inner_files } file_url = furl(self.random.choice(sorted(managed_access_file_urls))) response = self._get_url_unchecked(GET, file_url) From 42ea01527b7950c04310aea1bb9f2c67cae66f3f Mon Sep 17 00:00:00 2001 From: Noa Dove Date: Mon, 12 Jan 2026 17:08:22 -0800 Subject: [PATCH 12/15] Pull up MirrorService._source_service into superclass --- src/azul/indexer/mirror_service.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/azul/indexer/mirror_service.py b/src/azul/indexer/mirror_service.py index 2ca6294740..d827ca5eaa 100644 --- a/src/azul/indexer/mirror_service.py +++ b/src/azul/indexer/mirror_service.py @@ -344,6 +344,10 @@ def _storage(self) -> StorageService: bucket = aws.mirror_bucket return StorageService(bucket) + @cached_property + def _source_service(self) -> SourceService: + return SourceService() + def may_mirror_files_from_source(self, source_spec: SourceSpec) -> bool: """ Test whether it makes sense to request the mirroring of files from the @@ -519,10 +523,6 @@ class MirrorService(BaseMirrorService, HasCachedHttpClient): _schema_url_func: SchemaUrlFunc - @cached_property - def _source_service(self) -> SourceService: - return SourceService() - # We don't store the mirrored files' actual content type(s) in S3's # `Content-Type` metadata because a single file object may store the # contents of multiple file metadata entities, which may declare different From 84da6dcffa2e9da9fc34ec557343f86b1ad3c36b Mon Sep 17 00:00:00 2001 From: Noa Dove Date: Mon, 26 Jan 2026 15:59:29 -0800 Subject: [PATCH 13/15] Make cached plugins public and remove redundant instantiation --- src/azul/indexer/mirror_service.py | 14 +++++++------- src/azul/service/source_service.py | 6 +++--- test/indexer/test_mirror_controller.py | 2 +- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/src/azul/indexer/mirror_service.py b/src/azul/indexer/mirror_service.py index d827ca5eaa..40a1ee6a43 100644 --- a/src/azul/indexer/mirror_service.py +++ b/src/azul/indexer/mirror_service.py @@ -333,9 +333,9 @@ class BaseMirrorService: def _queues(self) -> Queues: return Queues() - @cached_property - def _repository_plugin(self) -> RepositoryPlugin: - return RepositoryPlugin.load(self.catalog).create(self.catalog) + @property + def repository_plugin(self) -> RepositoryPlugin: + return self._source_service.repository_plugin(self.catalog) @cached_property def _storage(self) -> StorageService: @@ -356,7 +356,7 @@ def may_mirror_files_from_source(self, source_spec: SourceSpec) -> bool: definitely refuse to mirror all files from the source. """ if self.may_mirror(): - plugin = self._repository_plugin + plugin = self.repository_plugin source_config = plugin.sources[source_spec] return source_config.mirror else: @@ -551,7 +551,7 @@ def _(self, a: MirrorSourceAction) -> Iterator[MirrorAction]: authentication=None) assert a.source.id in public_sources, R( 'Cannot mirror non-public source', a.source) - plugin = self._repository_plugin + plugin = self.repository_plugin # The desired partition size depends on the maximum number of messages # we can send in one Lambda invocation, because queueing the individual # mirror_file messages turns out to dominate the running time of @@ -576,7 +576,7 @@ def _(self, a: MirrorSourceAction) -> Iterator[MirrorAction]: @_mirror.register def _(self, a: MirrorPartitionAction) -> Iterator[MirrorAction]: - plugin = self._repository_plugin + plugin = self.repository_plugin files = plugin.list_files(a.source, a.prefix) for file in files: assert file.size is not None, R('File size unknown', file) @@ -722,7 +722,7 @@ def _repository_url(self, file: File) -> furl: 'Only TDR catalogs are supported', self.catalog) assert file.drs_uri is not None, R( 'File cannot be downloaded', file) - object = self._repository_plugin.drs_object(file.drs_uri) + object = self.repository_plugin.drs_object(file.drs_uri) access = object.get(AccessMethod.gs) assert access.method is AccessMethod.https, access return furl(access.url) diff --git a/src/azul/service/source_service.py b/src/azul/service/source_service.py index 6d4d31fd77..09ce18e57b 100644 --- a/src/azul/service/source_service.py +++ b/src/azul/service/source_service.py @@ -50,14 +50,14 @@ def __init__(self, key: str): class SourceService: @cache - def _repository_plugin(self, catalog: CatalogName) -> RepositoryPlugin: + def repository_plugin(self, catalog: CatalogName) -> RepositoryPlugin: return RepositoryPlugin.load(catalog).create(catalog) def list_accessible_source_ids(self, catalog: CatalogName, authentication: Authentication | None ) -> set[str]: - plugin = self._repository_plugin(catalog) + plugin = self.repository_plugin(catalog) cache_key = ( catalog, @@ -77,7 +77,7 @@ def list_accessible_sources(self, catalog: CatalogName, authentication: Authentication | None ) -> Iterable[SourceRef]: - return self._repository_plugin(catalog).list_accessible_sources(authentication) + return self.repository_plugin(catalog).list_accessible_sources(authentication) table_name = config.dynamo_sources_cache_table_name diff --git a/test/indexer/test_mirror_controller.py b/test/indexer/test_mirror_controller.py index cb4c637706..ef8e5a8b15 100644 --- a/test/indexer/test_mirror_controller.py +++ b/test/indexer/test_mirror_controller.py @@ -169,7 +169,7 @@ def _test_mirror_source(self, source_message): def _test_mirror_partition(self, partition_message, files: list[HCAFile]): event = self._mirror_event(partition_message) - plugin_cls = type(self.service._repository_plugin) + plugin_cls = type(self.service.repository_plugin) with patch.object(plugin_cls, 'list_files', return_value=files): self.mirror_controller.mirror(event) file_message = one(self._read_queue(self.service._mirror_queue())) From 63349fb85bb9359f6a880a5acfa185975512b5e8 Mon Sep 17 00:00:00 2001 From: Noa Dove Date: Mon, 12 Jan 2026 16:26:51 -0800 Subject: [PATCH 14/15] Fix: Mirror URI in manifest /index/files response is set for MA files (#7687) --- .../vendor/resources/sources.json.template.py | 1 + lambdas/lambdas.mk | 5 +- .../vendor/resources/sources.json.template.py | 1 + resources/sources.json.template.py | 8 +++ src/azul/indexer/mirror_service.py | 9 +++- src/azul/plugins/__init__.py | 8 +++ .../plugins/repository/canned/__init__.py | 3 ++ src/azul/plugins/repository/dss/__init__.py | 3 ++ src/azul/plugins/repository/tdr.py | 4 ++ src/azul/service/source_service.py | 54 +++++++++++++++++++ test/azul_test_case.py | 15 ++++++ test/integration_test.py | 2 + test/service/test_source_service.py | 54 +++++++++++++++++++ 13 files changed, 165 insertions(+), 2 deletions(-) create mode 120000 lambdas/indexer/vendor/resources/sources.json.template.py create mode 120000 lambdas/service/vendor/resources/sources.json.template.py create mode 100644 resources/sources.json.template.py diff --git a/lambdas/indexer/vendor/resources/sources.json.template.py b/lambdas/indexer/vendor/resources/sources.json.template.py new file mode 120000 index 0000000000..01f50d6e18 --- /dev/null +++ b/lambdas/indexer/vendor/resources/sources.json.template.py @@ -0,0 +1 @@ +../../../../resources/sources.json.template.py \ No newline at end of file diff --git a/lambdas/lambdas.mk b/lambdas/lambdas.mk index 84f4816385..df1b1a4218 100644 --- a/lambdas/lambdas.mk +++ b/lambdas/lambdas.mk @@ -44,6 +44,9 @@ config: .chalice/config.json .PHONY: environ environ: vendor/resources/environ.json +.PHONY: sources +sources: vendor/resources/sources.json + .PHONY: local local: check_python config chalice local @@ -52,7 +55,7 @@ local: check_python config clean: git_clean_recursive .PHONY: package -package: check_branch check_python check_aws config environ compile +package: check_branch check_python check_aws config environ sources compile chalice package --stage $(AZUL_DEPLOYMENT_STAGE) --pkg-format terraform .chalice/terraform .PHONY: openapi diff --git a/lambdas/service/vendor/resources/sources.json.template.py b/lambdas/service/vendor/resources/sources.json.template.py new file mode 120000 index 0000000000..01f50d6e18 --- /dev/null +++ b/lambdas/service/vendor/resources/sources.json.template.py @@ -0,0 +1 @@ +../../../../resources/sources.json.template.py \ No newline at end of file diff --git a/resources/sources.json.template.py b/resources/sources.json.template.py new file mode 100644 index 0000000000..b3fe2adc72 --- /dev/null +++ b/resources/sources.json.template.py @@ -0,0 +1,8 @@ +from azul.service.source_service import ( + SourceService, +) +from azul.template import ( + emit, +) + +emit(SourceService().configured_sources_for_outsourcing) diff --git a/src/azul/indexer/mirror_service.py b/src/azul/indexer/mirror_service.py index 40a1ee6a43..93dd91391c 100644 --- a/src/azul/indexer/mirror_service.py +++ b/src/azul/indexer/mirror_service.py @@ -358,7 +358,14 @@ def may_mirror_files_from_source(self, source_spec: SourceSpec) -> bool: if self.may_mirror(): plugin = self.repository_plugin source_config = plugin.sources[source_spec] - return source_config.mirror + if source_config.mirror: + is_public = any( + source_spec == source.spec + for source in self._source_service.configured_public_sources + ) + return is_public + else: + return False else: return False diff --git a/src/azul/plugins/__init__.py b/src/azul/plugins/__init__.py index d344183c9e..a6e1a10215 100644 --- a/src/azul/plugins/__init__.py +++ b/src/azul/plugins/__init__.py @@ -687,6 +687,14 @@ def list_accessible_sources(self, """ raise NotImplementedError + @abstractmethod + def list_sources(self) -> Iterable[SOURCE_REF]: + """ + The sources the plugin is configured to read metadata from. Retrieving + this information may require a round-trip to the underlying repository. + """ + raise NotImplementedError + def list_accessible_source_ids(self, authentication: Authentication | None ) -> set[str]: diff --git a/src/azul/plugins/repository/canned/__init__.py b/src/azul/plugins/repository/canned/__init__.py index a1bc91bebc..e93997845c 100644 --- a/src/azul/plugins/repository/canned/__init__.py +++ b/src/azul/plugins/repository/canned/__init__.py @@ -88,6 +88,9 @@ class Plugin(RepositoryPlugin[ def list_accessible_sources(self, authentication: Authentication | None ) -> list[CannedSourceRef]: + return self.list_sources() + + def list_sources(self) -> list[CannedSourceRef]: return [ CannedSourceRef(id=self._lookup_source_id(spec), spec=spec, prefix=None) for spec in self.sources diff --git a/src/azul/plugins/repository/dss/__init__.py b/src/azul/plugins/repository/dss/__init__.py index 5bfcda60b2..61f92c59bb 100644 --- a/src/azul/plugins/repository/dss/__init__.py +++ b/src/azul/plugins/repository/dss/__init__.py @@ -117,6 +117,9 @@ def count_files(self, source: DSSSourceRef) -> NoReturn: def list_accessible_sources(self, authentication: Authentication | None ) -> list[DSSSourceRef]: + return self.list_sources() + + def list_sources(self) -> list[DSSSourceRef]: return [ DSSSourceRef(id=self._lookup_source_id(spec), spec=spec, prefix=None) for spec in self.sources diff --git a/src/azul/plugins/repository/tdr.py b/src/azul/plugins/repository/tdr.py index 5c75eca18c..c8bb94de77 100644 --- a/src/azul/plugins/repository/tdr.py +++ b/src/azul/plugins/repository/tdr.py @@ -135,6 +135,10 @@ def list_accessible_source_ids(self, return self._auth_fallback(authentication, lambda tdr: tdr.snapshot_ids()) + def list_sources(self) -> list[TDRSourceRef]: + names_by_id = self.tdr.snapshot_names_by_id(filter=self._common_source_filter) + return self._match_sources(names_by_id) + @property def tdr(self): return self._tdr() diff --git a/src/azul/service/source_service.py b/src/azul/service/source_service.py index 09ce18e57b..54a2e22f2d 100644 --- a/src/azul/service/source_service.py +++ b/src/azul/service/source_service.py @@ -4,13 +4,17 @@ time, ) from typing import ( + AbstractSet, Iterable, + TypedDict, ) from azul import ( CatalogName, + NotInLambdaContextException, cache, config, + open_resource, ) from azul.auth import ( Authentication, @@ -26,6 +30,8 @@ ) from azul.types import ( AnyJSON, + JSON, + json_element_mappings, ) log = logging.getLogger(__name__) @@ -47,6 +53,11 @@ def __init__(self, key: str): super().__init__(f'Entry for key {key!r} is expired') +class _ConfiguredSources(TypedDict): + all: AbstractSet[SourceRef] + public: AbstractSet[SourceRef] + + class SourceService: @cache @@ -121,3 +132,46 @@ def _put(self, key: str, sources: list[AnyJSON]) -> None: def _now(self) -> int: return int(time()) + + @cache + def _configured_sources(self) -> _ConfiguredSources: + try: + with open_resource('sources.json') as f: + sources = json.load(f) + except NotInLambdaContextException: + all_sources, public_sources = set(), set() + for catalog in config.catalogs.values(): + if not catalog.is_integration_test_catalog: + all_sources.update(self.repository_plugin(catalog.name).list_sources()) + public_sources.update(self.list_accessible_sources(catalog.name, + authentication=None)) + return { + 'all': all_sources, + 'public': public_sources, + } + else: + def parse(sources: AnyJSON) -> AbstractSet[SourceRef]: + return frozenset( + SourceRef.from_json(source) + for source in json_element_mappings(sources) + ) + + return { + 'all': parse(sources['all']), + 'public': parse(sources['public']), + } + + @property + def configured_sources(self) -> AbstractSet[SourceRef]: + return self._configured_sources()['all'] + + @property + def configured_public_sources(self) -> AbstractSet[SourceRef]: + return self._configured_sources()['public'] + + @property + def configured_sources_for_outsourcing(self) -> JSON: + return { + k: [source.to_json() for source in v] + for k, v in self._configured_sources().items() + } diff --git a/test/azul_test_case.py b/test/azul_test_case.py index 5102b746be..a251f0b0a7 100644 --- a/test/azul_test_case.py +++ b/test/azul_test_case.py @@ -68,6 +68,9 @@ from azul.plugins.repository.tdr_hca import ( TDRSourceRef, ) +from azul.service.source_service import ( + SourceService, +) from azul.terra import ( TDRSourceSpec, ) @@ -361,6 +364,7 @@ def setUpClass(cls) -> None: cls._patch_catalogs() cls._patch_replicas_enabled() cls._patch_deployment() + cls._patch_configured_sources() @classmethod def _patch_catalogs(cls): @@ -404,6 +408,17 @@ def _patch_deployment(cls): new_callable=PropertyMock, return_value=config.deployment.test_name)) + @classmethod + def _patch_configured_sources(cls): + cls.addClassPatch(patch.object(SourceService, + 'configured_sources', + new_callable=PropertyMock, + return_value={cls.source})) + cls.addClassPatch(patch.object(SourceService, + 'configured_public_sources', + new_callable=PropertyMock, + return_value={cls.source})) + class DSSTestCase(CatalogTestCase, metaclass=ABCMeta): """ diff --git a/test/integration_test.py b/test/integration_test.py index 9495c10438..a1503c226a 100644 --- a/test/integration_test.py +++ b/test/integration_test.py @@ -1584,6 +1584,8 @@ def _test_managed_access_repository_files(self, } }) inner_files = [one(file['files']) for file in files] + for file in inner_files: + self.assertIsNone(file['azul_mirror_uri']) managed_access_file_urls = { file['azul_url'] for file in inner_files diff --git a/test/service/test_source_service.py b/test/service/test_source_service.py index 736829ac16..094e541469 100644 --- a/test/service/test_source_service.py +++ b/test/service/test_source_service.py @@ -1,3 +1,4 @@ +import json import time from typing import ( Mapping, @@ -13,11 +14,20 @@ ScalarAttributeTypeType, ) +from azul import ( + NotInLambdaContextException, +) +from azul.plugins.repository.dss import ( + DSSSourceRef, +) from azul.service.source_service import ( Expired, NotFound, SourceService, ) +from azul_test_case import ( + AzulUnitTestCase, +) from dynamodb_test_case import ( DynamoDBTestCase, ) @@ -49,3 +59,47 @@ def test_source_cache(self): time.sleep(self.wait + 1) with self.assertRaises(Expired): service._get(key) + + +class TestConfiguredSources(AzulUnitTestCase): + public_sources = {DSSSourceRef.for_dss_source('foo', '/0')} + all_sources = {*public_sources, DSSSourceRef.for_dss_source('bar', '/1')} + configured_sources_for_outsourcing = { + 'all': [s.to_json() for s in all_sources], + 'public': [s.to_json() for s in public_sources], + } + + @classmethod + def setUpClass(cls) -> None: + super().setUpClass() + + class MockPlugin: + + def list_accessible_sources(self, authentication): + assert authentication is None, authentication + return TestConfiguredSources.public_sources + + def list_sources(self): + return TestConfiguredSources.all_sources + + cls.addClassPatch(mock.patch.object(SourceService, + 'repository_plugin', + return_value=MockPlugin())) + + @mock.patch('azul.service.source_service.open_resource', + side_effect=NotInLambdaContextException('')) + def test_outside_lambda(self, open_resource): + self._test() + open_resource.assert_called_once() + + @mock.patch('azul.service.source_service.open_resource', + new_callable=mock.mock_open, + read_data=json.dumps(configured_sources_for_outsourcing)) + def test_inside_lambda(self, open_resource): + self._test() + open_resource.assert_called_once() + + def _test(self): + service = SourceService() + self.assertSetEqual(self.all_sources, service.configured_sources) + self.assertSetEqual(self.public_sources, service.configured_public_sources) From 71b5d8ae6986fb41306362ac56af5b273eb25fdd Mon Sep 17 00:00:00 2001 From: Noa Dove Date: Wed, 28 Jan 2026 18:05:29 -0800 Subject: [PATCH 15/15] Fix: OpenSearch queries include many redundant source IDs (#7644) --- src/azul/service/source_service.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/azul/service/source_service.py b/src/azul/service/source_service.py index 54a2e22f2d..53eea7059e 100644 --- a/src/azul/service/source_service.py +++ b/src/azul/service/source_service.py @@ -81,6 +81,8 @@ def list_accessible_source_ids(self, source_ids = set(self._get(cache_key)) except CacheMiss: source_ids = plugin.list_accessible_source_ids(authentication) + configured_source_ids = {source.id for source in self.configured_sources} + source_ids &= configured_source_ids self._put(cache_key, list(source_ids)) return source_ids