Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions CHANGELOG-say-see-mode.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
- Add "Say & See Mode" tab to the donor, sample, and dataset search pages with an embedded UDI chat (`udi-yac`) wired to the portal-ui back-end. Authenticated HuBMAP-Read users use server-side AI credentials; other users are prompted for an OpenAI API key.
- Restore a top-level search input attached above the results table, alongside the existing filter chips and selected-items rows.
- Add a dismissible promo alert above the page title, and a promo dialog shown on page open. Dismissal is persisted in `localStorage` and shared across the three search pages.
- Support deep-linking to either tab via a new `mode=filter|say-see` URL parameter.
- Backend: `/metadata/v0/udi/datapackage.json` and `/metadata/v0/udi/<entity>.tsv` now accept `?public=1` to serve from the shared cache regardless of session, generated with a tokenless ApiClient. Public-scope responses emit `Cache-Control: public, max-age=43200` + `ETag`; authenticated responses emit `Cache-Control: private, no-store`.
- Match the search page's default filter on the UDI data routes by excluding entities with `next_revision_uuid` or `sub_status`, so authenticated dataset counts in Say & See align with `/search/datasets`.
6 changes: 6 additions & 0 deletions context/app/default_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,3 +62,9 @@ class DefaultConfig(object):
# HuBMAP-Read users. If unset, all callers must supply X-OpenAI-Key.
OPENAI_API_KEY = None
UDI_GPT_MODEL_NAME = 'gpt-5.4'

# Optional: Langfuse observability for UDIAgent. Tracing is enabled when
# any of these is set; otherwise the plain OpenAI client is used.
LANGFUSE_PUBLIC_KEY = None
LANGFUSE_SECRET_KEY = None
LANGFUSE_BASE_URL = None
48 changes: 44 additions & 4 deletions context/app/routes_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,12 @@ def metadata_descriptions():


def _generate_tsv_response(
entity_type: str, with_descriptions: bool = True, cors_origin: Optional[str] = None
entity_type: str,
with_descriptions: bool = True,
cors_origin: Optional[str] = None,
use_groups_token: bool = True,
excluded_fields: Optional[list] = None,
exclude_revisions: bool = False,
):
if request.method == 'GET':
all_args = request.args.to_dict(flat=False)
Expand All @@ -76,7 +81,14 @@ def _generate_tsv_response(
constraints = {}
uuids = body.get('uuids')

entities = _get_entities(entity_type, constraints, uuids)
entities = _get_entities(
entity_type,
constraints,
uuids,
use_groups_token=use_groups_token,
excluded_fields=excluded_fields,
exclude_revisions=exclude_revisions,
)

if with_descriptions:
descriptions_dict = metadata_descriptions()
Expand Down Expand Up @@ -115,10 +127,17 @@ def lineup(entity_type):
_first_fields = ['uuid', 'hubmap_id']


def _get_entities(entity_type, constraints={}, uuids=None):
def _get_entities(
entity_type,
constraints={},
uuids=None,
use_groups_token=True,
excluded_fields=None,
exclude_revisions=False,
):
if entity_type not in ['donors', 'samples', 'datasets']:
abort(404)
client = get_client()
client = get_client(use_groups_token=use_groups_token)
extra_fields = _first_fields[:]
extra_fields += [
# Version number is not in document:
Expand Down Expand Up @@ -149,13 +168,34 @@ def _get_entities(entity_type, constraints={}, uuids=None):
post_filter_extra = None
if entity_type == 'samples':
post_filter_extra = {'exists': {'field': 'descendant_counts.entity_type.Dataset'}}

# When `exclude_revisions` is set, mirror the search-page default filter so
# superseded revisions and sub-status'd entities don't inflate counts. Pass
# via `query_override` since `client.get_entities`'s `_make_query` doesn't
# accept must_not clauses.
query_override = None
if exclude_revisions:
query_override = {
'bool': {
'must_not': [
{'exists': {'field': 'next_revision_uuid'}},
{'exists': {'field': 'sub_status'}},
],
},
}

entities = client.get_entities(
plural_lc_entity_type=entity_type,
non_metadata_fields=extra_fields,
constraints=constraints,
uuids=uuids,
post_filter_extra=post_filter_extra,
query_override=query_override,
)
if excluded_fields:
for entity in entities:
for field in excluded_fields:
entity.pop(field, None)
return entities


Expand Down
94 changes: 75 additions & 19 deletions context/app/routes_udi.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import hashlib
import json
import os
import time
Expand Down Expand Up @@ -58,6 +59,33 @@ def _set_cached(key, data):
_udi_cache[key] = (time.time(), data)


# Fields to strip from UDI responses. `assaytype` overlaps semantically with the
# dataset_type field and creates confusion in the chat UI's schema.
_UDI_EXCLUDED_FIELDS = ['assaytype']


def _wants_public_scope():
    """Tell whether the current request explicitly asks for public scope.

    A caller opts in by adding ``?public=1`` (``?public=true`` is accepted as
    well) to the URL. Logged-in users can use this to receive the cached
    public datapackage/TSVs, giving up per-user data access in exchange for
    shared-cache speed.

    Returns:
        True when the ``public`` query parameter is exactly ``'1'`` or
        ``'true'``; False otherwise, including when it is absent.
    """
    public_flag = request.args.get('public')
    return public_flag == '1' or public_flag == 'true'


def _serve_in_public_scope():
    """Decide whether this request should be answered in public scope.

    Unauthenticated callers are always served in public scope. Authenticated
    callers are served in public scope only when they opt in, as reported by
    ``_wants_public_scope()``.
    """
    if _is_authenticated():
        # Logged-in callers keep their own scope unless they asked otherwise.
        return _wants_public_scope()
    return True


def _apply_cache_headers(response, *, public, etag_payload=None):
    """Set caching headers on ``response`` according to its scope.

    Args:
        response: Object exposing a mutable ``headers`` mapping; it is
            modified in place.
        public: When truthy the response is marked cacheable
            (``public, max-age=<_UDI_CACHE_TTL>``); otherwise it is marked
            ``private, no-store`` so shared caches cannot leak authed data.
        etag_payload: Optional bytes hashed into a quoted ``ETag`` value.
            Only used for public responses; ignored when ``None``.

    Returns:
        The same ``response`` object that was passed in.
    """
    if not public:
        response.headers['Cache-Control'] = 'private, no-store'
        return response

    response.headers['Cache-Control'] = f'public, max-age={_UDI_CACHE_TTL}'
    if etag_payload is not None:
        # The digest is only a cache validator, not a security primitive.
        digest = hashlib.md5(etag_payload).hexdigest()  # noqa: S324 (non-security ETag)
        response.headers['ETag'] = f'"{digest}"'
    return response


_UDI_ALLOWED_ORIGINS = ['https://hms-dbmi.github.io']
# Extra origins allowed when the app is running in dev/test mode (Flask's
# --debug flag or TESTING=True) so the standalone chat frontend on Vite's
Expand Down Expand Up @@ -100,6 +128,9 @@ def _build_orchestrator(openai_api_key):
agent = UDIAgent(
gpt_model_name=current_app.config.get('UDI_GPT_MODEL_NAME', 'gpt-5.4'),
openai_api_key=openai_api_key,
langfuse_public_key=current_app.config.get('LANGFUSE_PUBLIC_KEY'),
langfuse_secret_key=current_app.config.get('LANGFUSE_SECRET_KEY'),
langfuse_host=current_app.config.get('LANGFUSE_BASE_URL'),
)
return Orchestrator(agent=agent)

Expand Down Expand Up @@ -142,40 +173,55 @@ def _pick_orchestrator():
# removes CORS block.
@blueprint.route('/metadata/v0/udi/<entity_type>.tsv', methods=['GET', 'POST'])
def entities_plain_tsv(entity_type):
if not _is_authenticated():
public_scope = _serve_in_public_scope()

if public_scope:
cached = _get_cached(f'tsv:{entity_type}')
if cached:
tsv, filename = cached
response = make_response(tsv)
response.headers['Content-Type'] = 'text/tab-separated-values; charset=utf-8'
response.headers['Content-Disposition'] = f'attachment; filename={filename}'
return response

response = _generate_tsv_response(entity_type, with_descriptions=False)
return _apply_cache_headers(response, public=True, etag_payload=tsv.encode('utf-8'))

response = _generate_tsv_response(
entity_type,
with_descriptions=False,
use_groups_token=not public_scope,
excluded_fields=_UDI_EXCLUDED_FIELDS,
exclude_revisions=True,
)

if not _is_authenticated():
if public_scope:
tsv = response.get_data(as_text=True)
filename = response.headers.get('Content-Disposition', '').split('filename=')[-1]
_set_cached(f'tsv:{entity_type}', (tsv, filename))
return _apply_cache_headers(response, public=True, etag_payload=tsv.encode('utf-8'))

return response
return _apply_cache_headers(response, public=False)


@blueprint.route('/metadata/v0/udi/datapackage.json', methods=['GET'])
def udi_datapackage():
# This endpoint serves the datapackage.json used to power the UDI chat.

# If a user is not authenticated, we cache the generated datapackage for 12 hours to improve
# load times, since generating the datapackage involves multiple API calls and can be slow.
if not _is_authenticated():
#
# Anonymous users — and authenticated users who pass ?public=1 — are served
# from a 12-hour shared cache for fast Say & See open times. Authenticated
# users without ?public=1 get a fresh build under their own groups_token so
# the datapackage reflects their actual data access.
public_scope = _serve_in_public_scope()

if public_scope:
cached = _get_cached('datapackage')
if cached:
return jsonify(cached)

# If a user is authenticated, we do not cache, since they may have access to different data
# and we want to ensure they get the correct datapackage.
response = jsonify(cached)
return _apply_cache_headers(
response,
public=True,
etag_payload=json.dumps(cached, sort_keys=True).encode('utf-8'),
)

client = get_client()
client = get_client(use_groups_token=not public_scope)

field_descriptions_raw = client.get_metadata_descriptions()
descriptions_dict = {
Expand All @@ -187,7 +233,12 @@ def udi_datapackage():

resources = []
for entity_type in ['donors', 'samples', 'datasets']:
entities = _get_entities(entity_type)
entities = _get_entities(
entity_type,
use_groups_token=not public_scope,
excluded_fields=_UDI_EXCLUDED_FIELDS,
exclude_revisions=True,
)
resource = build_resource(
entity_type, entities, descriptions_dict, types_dict, _first_fields
)
Expand All @@ -200,11 +251,16 @@ def udi_datapackage():
'udi:path': f'{get_url_base_from_request()}/metadata/v0/udi/',
}

# Update the cache for unauthenticated users so subsequent requests are faster.
if not _is_authenticated():
response = jsonify(datapackage)
if public_scope:
_set_cached('datapackage', datapackage)
return _apply_cache_headers(
response,
public=True,
etag_payload=json.dumps(datapackage, sort_keys=True).encode('utf-8'),
)

return jsonify(datapackage)
return _apply_cache_headers(response, public=False)


@blueprint.route('/v1/yac/completions', methods=['POST'])
Expand Down
Loading
Loading