hubmapconsortium · NickAkhmetov · Apr 30, 2026 · Apr 22, 2026 · Apr 29, 2026 · Apr 29, 2026
diff --git a/CHANGELOG-say-see-mode.md b/CHANGELOG-say-see-mode.md
@@ -0,0 +1,6 @@
+- Add "Say & See Mode" tab to the donor, sample, and dataset search pages with an embedded UDI chat (`udi-yac`) wired to the portal-ui back-end. Authenticated HuBMAP-Read users use server-side AI credentials; other users are prompted for an OpenAI API key.
+- Restore a top-level search input attached above the results table, alongside the existing filter chips and selected-items rows.
+- Add a dismissible promo alert above the page title, plus a dialog shown on page open. Dismissal is persisted in `localStorage` and shared across the three search pages.
+- Support deep-linking to either tab via a new `mode=filter|say-see` URL parameter.
+- Backend: `/metadata/v0/udi/datapackage.json` and `/metadata/v0/udi/<entity>.tsv` now accept `?public=1` (or `?public=true`) to serve the shared cached response regardless of session; that response is generated with a tokenless ApiClient. Public-scope responses emit `Cache-Control: public, max-age=43200` + `ETag`; authenticated responses emit `Cache-Control: private, no-store`.
+- Match the search page's default filter on the UDI data routes by excluding entities with `next_revision_uuid` or `sub_status`, so authenticated dataset counts in Say & See align with `/search/datasets`.
diff --git a/context/app/default_config.py b/context/app/default_config.py
@@ -62,3 +62,9 @@ class DefaultConfig(object):
     # HuBMAP-Read users. If unset, all callers must supply X-OpenAI-Key.
     OPENAI_API_KEY = None
     UDI_GPT_MODEL_NAME = 'gpt-5.4'
+
+    # Optional: Langfuse observability for UDIAgent. Tracing is enabled when
+    # any of these is set; otherwise the plain OpenAI client is used.
+    LANGFUSE_PUBLIC_KEY = None
+    LANGFUSE_SECRET_KEY = None
+    LANGFUSE_BASE_URL = None
diff --git a/context/app/routes_api.py b/context/app/routes_api.py
@@ -62,7 +62,12 @@ def metadata_descriptions():
 
 
 def _generate_tsv_response(
-    entity_type: str, with_descriptions: bool = True, cors_origin: Optional[str] = None
+    entity_type: str,
+    with_descriptions: bool = True,
+    cors_origin: Optional[str] = None,
+    use_groups_token: bool = True,
+    excluded_fields: Optional[list] = None,
+    exclude_revisions: bool = False,
 ):
     if request.method == 'GET':
         all_args = request.args.to_dict(flat=False)
@@ -76,7 +81,14 @@ def _generate_tsv_response(
         constraints = {}
         uuids = body.get('uuids')
 
-    entities = _get_entities(entity_type, constraints, uuids)
+    entities = _get_entities(
+        entity_type,
+        constraints,
+        uuids,
+        use_groups_token=use_groups_token,
+        excluded_fields=excluded_fields,
+        exclude_revisions=exclude_revisions,
+    )
 
     if with_descriptions:
         descriptions_dict = metadata_descriptions()
@@ -115,10 +127,17 @@ def lineup(entity_type):
 _first_fields = ['uuid', 'hubmap_id']
 
 
-def _get_entities(entity_type, constraints={}, uuids=None):
+def _get_entities(
+    entity_type,
+    constraints={},
+    uuids=None,
+    use_groups_token=True,
+    excluded_fields=None,
+    exclude_revisions=False,
+):
     if entity_type not in ['donors', 'samples', 'datasets']:
         abort(404)
-    client = get_client()
+    client = get_client(use_groups_token=use_groups_token)
     extra_fields = _first_fields[:]
     extra_fields += [
         # Version number is not in document:
@@ -149,13 +168,34 @@ def _get_entities(entity_type, constraints={}, uuids=None):
     post_filter_extra = None
     if entity_type == 'samples':
         post_filter_extra = {'exists': {'field': 'descendant_counts.entity_type.Dataset'}}
+
+    # When `exclude_revisions` is set, mirror the search-page default filter so
+    # superseded revisions and sub-status'd entities don't inflate counts. Pass
+    # via `query_override` since `client.get_entities`'s `_make_query` doesn't
+    # accept must_not clauses.
+    query_override = None
+    if exclude_revisions:
+        query_override = {
+            'bool': {
+                'must_not': [
+                    {'exists': {'field': 'next_revision_uuid'}},
+                    {'exists': {'field': 'sub_status'}},
+                ],
+            },
+        }
+
     entities = client.get_entities(
         plural_lc_entity_type=entity_type,
         non_metadata_fields=extra_fields,
         constraints=constraints,
         uuids=uuids,
         post_filter_extra=post_filter_extra,
+        query_override=query_override,
     )
+    if excluded_fields:
+        for entity in entities:
+            for field in excluded_fields:
+                entity.pop(field, None)
     return entities
 
 

diff --git a/context/app/routes_udi.py b/context/app/routes_udi.py
@@ -1,3 +1,4 @@
+import hashlib
 import json
 import os
 import time
@@ -58,6 +59,33 @@ def _set_cached(key, data):
     _udi_cache[key] = (time.time(), data)
 
 
+# Fields to strip from UDI responses. `assaytype` overlaps semantically with the
+# dataset_type field and creates confusion in the chat UI's schema.
+_UDI_EXCLUDED_FIELDS = ['assaytype']
+
+
+def _wants_public_scope():
+    """Logged-in users can opt in to the cached public datapackage/TSVs by
+    appending ?public=1 (or ?public=true), trading per-user data access for shared-cache speed."""
+    return request.args.get('public') in ('1', 'true')
+
+
+def _serve_in_public_scope():
+    return not _is_authenticated() or _wants_public_scope()
+
+
+def _apply_cache_headers(response, *, public, etag_payload=None):
+    """Cacheable when the response is generated in public scope; otherwise
+    `private, no-store` to prevent shared caches from leaking authed data."""
+    if public:
+        response.headers['Cache-Control'] = f'public, max-age={_UDI_CACHE_TTL}'
+        if etag_payload is not None:
+            response.headers['ETag'] = f'"{hashlib.md5(etag_payload).hexdigest()}"'  # noqa: S324  (non-security ETag)
+    else:
+        response.headers['Cache-Control'] = 'private, no-store'
+    return response
+
+
 _UDI_ALLOWED_ORIGINS = ['https://hms-dbmi.github.io']
 # Extra origins allowed when the app is running in dev/test mode (Flask's
 # --debug flag or TESTING=True) so the standalone chat frontend on Vite's
@@ -100,6 +128,9 @@ def _build_orchestrator(openai_api_key):
     agent = UDIAgent(
         gpt_model_name=current_app.config.get('UDI_GPT_MODEL_NAME', 'gpt-5.4'),
         openai_api_key=openai_api_key,
+        langfuse_public_key=current_app.config.get('LANGFUSE_PUBLIC_KEY'),
+        langfuse_secret_key=current_app.config.get('LANGFUSE_SECRET_KEY'),
+        langfuse_host=current_app.config.get('LANGFUSE_BASE_URL'),
     )
     return Orchestrator(agent=agent)
 
@@ -142,40 +173,55 @@ def _pick_orchestrator():
 # removes CORS block.
 @blueprint.route('/metadata/v0/udi/<entity_type>.tsv', methods=['GET', 'POST'])
 def entities_plain_tsv(entity_type):
-    if not _is_authenticated():
+    public_scope = _serve_in_public_scope()
+
+    if public_scope:
         cached = _get_cached(f'tsv:{entity_type}')
         if cached:
             tsv, filename = cached
             response = make_response(tsv)
             response.headers['Content-Type'] = 'text/tab-separated-values; charset=utf-8'
             response.headers['Content-Disposition'] = f'attachment; filename={filename}'
-            return response
-
-    response = _generate_tsv_response(entity_type, with_descriptions=False)
+            return _apply_cache_headers(response, public=True, etag_payload=tsv.encode('utf-8'))
+
+    response = _generate_tsv_response(
+        entity_type,
+        with_descriptions=False,
+        use_groups_token=not public_scope,
+        excluded_fields=_UDI_EXCLUDED_FIELDS,
+        exclude_revisions=True,
+    )
 
-    if not _is_authenticated():
+    if public_scope:
         tsv = response.get_data(as_text=True)
         filename = response.headers.get('Content-Disposition', '').split('filename=')[-1]
         _set_cached(f'tsv:{entity_type}', (tsv, filename))
+        return _apply_cache_headers(response, public=True, etag_payload=tsv.encode('utf-8'))
 
-    return response
+    return _apply_cache_headers(response, public=False)
 
 
 @blueprint.route('/metadata/v0/udi/datapackage.json', methods=['GET'])
 def udi_datapackage():
     # This endpoint serves the datapackage.json used to power the UDI chat.
-
-    # If a user is not authenticated, we cache the generated datapackage for 12 hours to improve
-    # load times, since generating the datapackage involves multiple API calls and can be slow.
-    if not _is_authenticated():
+    #
+    # Anonymous users — and authenticated users who pass ?public=1 — are served
+    # from a 12-hour shared cache for fast Say & See open times. Authenticated
+    # users without ?public=1 get a fresh build under their own groups_token so
+    # the datapackage reflects their actual data access.
+    public_scope = _serve_in_public_scope()
+
+    if public_scope:
         cached = _get_cached('datapackage')
         if cached:
-            return jsonify(cached)
-
-    # If a user is authenticated, we do not cache, since they may have access to different data
-    # and we want to ensure they get the correct datapackage.
+            response = jsonify(cached)
+            return _apply_cache_headers(
+                response,
+                public=True,
+                etag_payload=json.dumps(cached, sort_keys=True).encode('utf-8'),
+            )
 
-    client = get_client()
+    client = get_client(use_groups_token=not public_scope)
 
     field_descriptions_raw = client.get_metadata_descriptions()
     descriptions_dict = {
@@ -187,7 +233,12 @@ def udi_datapackage():
 
     resources = []
     for entity_type in ['donors', 'samples', 'datasets']:
-        entities = _get_entities(entity_type)
+        entities = _get_entities(
+            entity_type,
+            use_groups_token=not public_scope,
+            excluded_fields=_UDI_EXCLUDED_FIELDS,
+            exclude_revisions=True,
+        )
         resource = build_resource(
             entity_type, entities, descriptions_dict, types_dict, _first_fields
         )
@@ -200,11 +251,16 @@ def udi_datapackage():
         'udi:path': f'{get_url_base_from_request()}/metadata/v0/udi/',
     }
 
-    # Update the cache for unauthenticated users so subsequent requests are faster.
-    if not _is_authenticated():
+    response = jsonify(datapackage)
+    if public_scope:
         _set_cached('datapackage', datapackage)
+        return _apply_cache_headers(
+            response,
+            public=True,
+            etag_payload=json.dumps(datapackage, sort_keys=True).encode('utf-8'),
+        )
 
-    return jsonify(datapackage)
+    return _apply_cache_headers(response, public=False)
 
 
 @blueprint.route('/v1/yac/completions', methods=['POST'])