diff --git a/.release-please-manifest.json b/.release-please-manifest.json index 36b2aff..f996350 100644 --- a/.release-please-manifest.json +++ b/.release-please-manifest.json @@ -1,3 +1,3 @@ { - ".": "0.1.0-alpha.34" + ".": "0.1.0-alpha.35" } \ No newline at end of file diff --git a/.stats.yml b/.stats.yml index c021c17..7a6c349 100644 --- a/.stats.yml +++ b/.stats.yml @@ -1,3 +1,3 @@ configured_endpoints: 70 -openapi_spec_hash: 9018ebfb2a9e1afa87058b3a4bd41b0b +openapi_spec_hash: 11279400677011ad5dc1ebba33216ae4 config_hash: aad16f20fed13ac50211fc1d0e2ea621 diff --git a/CHANGELOG.md b/CHANGELOG.md index 7782cb1..95cf32e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,32 @@ # Changelog +## 0.1.0-alpha.35 (2025-12-17) + +Full Changelog: [v0.1.0-alpha.34...v0.1.0-alpha.35](https://github.com/cleanlab/codex-python/compare/v0.1.0-alpha.34...v0.1.0-alpha.35) + +### Features + +* **api:** api update ([7f283d7](https://github.com/cleanlab/codex-python/commit/7f283d7abb4b9b79de86c88745fb66ea9943cdae)) +* **api:** api update ([7742c60](https://github.com/cleanlab/codex-python/commit/7742c60ecad518656a184513c5228a3447aa34c9)) +* **api:** api update ([94bacaf](https://github.com/cleanlab/codex-python/commit/94bacaf492809bc9bc15175d272de53ad2569895)) +* **api:** api update ([884de94](https://github.com/cleanlab/codex-python/commit/884de944e616b26580830817486bb85e74f1e7c4)) + + +### Bug Fixes + +* ensure streams are always closed ([2c971c4](https://github.com/cleanlab/codex-python/commit/2c971c4a93b0e407737648e83e555dc6c9b3a759)) +* **types:** allow pyright to infer TypedDict types within SequenceNotStr ([d64e474](https://github.com/cleanlab/codex-python/commit/d64e47443ef147240de6cba892e901dcab0b2d71)) + + +### Chores + +* add missing docstrings ([250433e](https://github.com/cleanlab/codex-python/commit/250433e37cb8ba034de2977ee6375f06390cc6c4)) +* add Python 3.14 classifier and testing ([4dec29c](https://github.com/cleanlab/codex-python/commit/4dec29cdf74dd3beeccf326678db7170156f0c44)) +* **deps:** mypy 1.18.1 has a regression, pin to 1.17 ([1828526](https://github.com/cleanlab/codex-python/commit/18285268b4eec848b2be2df65cdbdf960424f72d)) +* **internal:** add missing files argument to base client ([c8986ce](https://github.com/cleanlab/codex-python/commit/c8986ce9fa0eae5726ba6cb6692dfa11c60284f5)) +* speedup initial import ([9f17615](https://github.com/cleanlab/codex-python/commit/9f17615353be5ee705ea2f4713d9dc790b2ecb3b)) +* update lockfile ([230659a](https://github.com/cleanlab/codex-python/commit/230659a94b4921805c84224578df1324829e5d07)) + ## 0.1.0-alpha.34 (2025-11-19) Full Changelog: [v0.1.0-alpha.33...v0.1.0-alpha.34](https://github.com/cleanlab/codex-python/compare/v0.1.0-alpha.33...v0.1.0-alpha.34) diff --git a/pyproject.toml b/pyproject.toml index 5fb2418..d65296d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,20 +1,22 @@ [project] name = "codex-sdk" -version = "0.1.0-alpha.34" +version = "0.1.0-alpha.35" description = "Internal SDK used within cleanlab-codex package. Refer to https://pypi.org/project/cleanlab-codex/ instead." 
dynamic = ["readme"] license = "MIT" authors = [ { name = "Cleanlab", email = "team@cleanlab.ai" }, ] + dependencies = [ - "httpx>=0.23.0, <1", - "pydantic>=1.9.0, <3", - "typing-extensions>=4.10, <5", - "anyio>=3.5.0, <5", - "distro>=1.7.0, <2", - "sniffio", + "httpx>=0.23.0, <1", + "pydantic>=1.9.0, <3", + "typing-extensions>=4.10, <5", + "anyio>=3.5.0, <5", + "distro>=1.7.0, <2", + "sniffio", ] + requires-python = ">= 3.9" classifiers = [ "Typing :: Typed", @@ -24,6 +26,7 @@ classifiers = [ "Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3.12", "Programming Language :: Python :: 3.13", + "Programming Language :: Python :: 3.14", "Operating System :: OS Independent", "Operating System :: POSIX", "Operating System :: MacOS", @@ -45,7 +48,7 @@ managed = true # version pins are in requirements-dev.lock dev-dependencies = [ "pyright==1.1.399", - "mypy", + "mypy==1.17", "respx", "pytest", "pytest-asyncio", diff --git a/requirements-dev.lock b/requirements-dev.lock index d728372..90dc04b 100644 --- a/requirements-dev.lock +++ b/requirements-dev.lock @@ -12,40 +12,45 @@ -e file:. aiohappyeyeballs==2.6.1 # via aiohttp -aiohttp==3.12.8 +aiohttp==3.13.2 # via codex-sdk # via httpx-aiohttp -aiosignal==1.3.2 +aiosignal==1.4.0 # via aiohttp -annotated-types==0.6.0 +annotated-types==0.7.0 # via pydantic -anyio==4.4.0 +anyio==4.12.0 # via codex-sdk # via httpx -argcomplete==3.1.2 +argcomplete==3.6.3 # via nox async-timeout==5.0.1 # via aiohttp -attrs==25.3.0 +attrs==25.4.0 # via aiohttp -certifi==2023.7.22 + # via nox +backports-asyncio-runner==1.2.0 + # via pytest-asyncio +certifi==2025.11.12 # via httpcore # via httpx -colorlog==6.7.0 +colorlog==6.10.1 + # via nox +dependency-groups==1.3.1 # via nox -dirty-equals==0.6.0 -distlib==0.3.7 +dirty-equals==0.11 +distlib==0.4.0 # via virtualenv -distro==1.8.0 +distro==1.9.0 # via codex-sdk -exceptiongroup==1.2.2 +exceptiongroup==1.3.1 # via anyio # via pytest -execnet==2.1.1 +execnet==2.1.2 # via pytest-xdist -filelock==3.12.4 +filelock==3.19.1 # via virtualenv -frozenlist==1.6.2 +frozenlist==1.8.0 # via aiohttp # via aiosignal h11==0.16.0 @@ -58,80 +63,87 @@ httpx==0.28.1 # via respx httpx-aiohttp==0.1.9 # via codex-sdk -idna==3.4 +humanize==4.13.0 + # via nox +idna==3.11 # via anyio # via httpx # via yarl -importlib-metadata==7.0.0 -iniconfig==2.0.0 +importlib-metadata==8.7.0 +iniconfig==2.1.0 # via pytest markdown-it-py==3.0.0 # via rich mdurl==0.1.2 # via markdown-it-py -multidict==6.4.4 +multidict==6.7.0 # via aiohttp # via yarl -mypy==1.14.1 -mypy-extensions==1.0.0 +mypy==1.17.0 +mypy-extensions==1.1.0 # via mypy -nodeenv==1.8.0 +nodeenv==1.9.1 # via pyright -nox==2023.4.22 -packaging==23.2 +nox==2025.11.12 +packaging==25.0 + # via dependency-groups # via nox # via pytest -platformdirs==3.11.0 +pathspec==0.12.1 + # via mypy +platformdirs==4.4.0 # via virtualenv -pluggy==1.5.0 +pluggy==1.6.0 # via pytest -propcache==0.3.1 +propcache==0.4.1 # via aiohttp # via yarl -pydantic==2.11.9 +pydantic==2.12.5 # via codex-sdk -pydantic-core==2.33.2 +pydantic-core==2.41.5 # via pydantic -pygments==2.18.0 +pygments==2.19.2 + # via pytest # via rich pyright==1.1.399 -pytest==8.3.3 +pytest==8.4.2 # via pytest-asyncio # via pytest-xdist -pytest-asyncio==0.24.0 -pytest-xdist==3.7.0 -python-dateutil==2.8.2 +pytest-asyncio==1.2.0 +pytest-xdist==3.8.0 +python-dateutil==2.9.0.post0 # via time-machine -pytz==2023.3.post1 - # via dirty-equals respx==0.22.0 -rich==13.7.1 -ruff==0.9.4 -setuptools==68.2.2 - # via nodeenv -six==1.16.0 +rich==14.2.0 
+ruff==0.14.7 +six==1.17.0 # via python-dateutil -sniffio==1.3.0 - # via anyio +sniffio==1.3.1 # via codex-sdk -time-machine==2.9.0 -tomli==2.0.2 +time-machine==2.19.0 +tomli==2.3.0 + # via dependency-groups # via mypy + # via nox # via pytest -typing-extensions==4.12.2 +typing-extensions==4.15.0 + # via aiosignal # via anyio # via codex-sdk + # via exceptiongroup # via multidict # via mypy # via pydantic # via pydantic-core # via pyright + # via pytest-asyncio # via typing-inspection -typing-inspection==0.4.1 + # via virtualenv +typing-inspection==0.4.2 # via pydantic -virtualenv==20.24.5 +virtualenv==20.35.4 # via nox -yarl==1.20.0 +yarl==1.22.0 # via aiohttp -zipp==3.17.0 +zipp==3.23.0 # via importlib-metadata diff --git a/requirements.lock b/requirements.lock index 4b916da..2cedc49 100644 --- a/requirements.lock +++ b/requirements.lock @@ -12,28 +12,28 @@ -e file:. aiohappyeyeballs==2.6.1 # via aiohttp -aiohttp==3.12.8 +aiohttp==3.13.2 # via codex-sdk # via httpx-aiohttp -aiosignal==1.3.2 +aiosignal==1.4.0 # via aiohttp -annotated-types==0.6.0 +annotated-types==0.7.0 # via pydantic -anyio==4.4.0 +anyio==4.12.0 # via codex-sdk # via httpx async-timeout==5.0.1 # via aiohttp -attrs==25.3.0 +attrs==25.4.0 # via aiohttp -certifi==2023.7.22 +certifi==2025.11.12 # via httpcore # via httpx -distro==1.8.0 +distro==1.9.0 # via codex-sdk -exceptiongroup==1.2.2 +exceptiongroup==1.3.1 # via anyio -frozenlist==1.6.2 +frozenlist==1.8.0 # via aiohttp # via aiosignal h11==0.16.0 @@ -45,31 +45,32 @@ httpx==0.28.1 # via httpx-aiohttp httpx-aiohttp==0.1.9 # via codex-sdk -idna==3.4 +idna==3.11 # via anyio # via httpx # via yarl -multidict==6.4.4 +multidict==6.7.0 # via aiohttp # via yarl -propcache==0.3.1 +propcache==0.4.1 # via aiohttp # via yarl -pydantic==2.11.9 +pydantic==2.12.5 # via codex-sdk -pydantic-core==2.33.2 +pydantic-core==2.41.5 # via pydantic -sniffio==1.3.0 - # via anyio +sniffio==1.3.1 # via codex-sdk -typing-extensions==4.12.2 +typing-extensions==4.15.0 + # via aiosignal # via anyio # via codex-sdk + # via exceptiongroup # via multidict # via pydantic # via pydantic-core # via typing-inspection -typing-inspection==0.4.1 +typing-inspection==0.4.2 # via pydantic -yarl==1.20.0 +yarl==1.22.0 # via aiohttp diff --git a/src/codex/_base_client.py b/src/codex/_base_client.py index e6febf3..1ce4a39 100644 --- a/src/codex/_base_client.py +++ b/src/codex/_base_client.py @@ -1247,9 +1247,12 @@ def patch( *, cast_to: Type[ResponseT], body: Body | None = None, + files: RequestFiles | None = None, options: RequestOptions = {}, ) -> ResponseT: - opts = FinalRequestOptions.construct(method="patch", url=path, json_data=body, **options) + opts = FinalRequestOptions.construct( + method="patch", url=path, json_data=body, files=to_httpx_files(files), **options + ) return self.request(cast_to, opts) def put( @@ -1767,9 +1770,12 @@ async def patch( *, cast_to: Type[ResponseT], body: Body | None = None, + files: RequestFiles | None = None, options: RequestOptions = {}, ) -> ResponseT: - opts = FinalRequestOptions.construct(method="patch", url=path, json_data=body, **options) + opts = FinalRequestOptions.construct( + method="patch", url=path, json_data=body, files=to_httpx_files(files), **options + ) return await self.request(cast_to, opts) async def put( diff --git a/src/codex/_client.py b/src/codex/_client.py index 308ce9a..1bfcb7e 100644 --- a/src/codex/_client.py +++ b/src/codex/_client.py @@ -3,7 +3,7 @@ from __future__ import annotations import os -from typing import Any, Dict, Mapping, cast +from typing 
import TYPE_CHECKING, Any, Dict, Mapping, cast from typing_extensions import Self, Literal, override import httpx @@ -21,8 +21,8 @@ not_given, ) from ._utils import is_given, get_async_library +from ._compat import cached_property from ._version import __version__ -from .resources import health from ._streaming import Stream as Stream, AsyncStream as AsyncStream from ._exceptions import APIStatusError from ._base_client import ( @@ -30,9 +30,13 @@ SyncAPIClient, AsyncAPIClient, ) -from .resources.users import users -from .resources.projects import projects -from .resources.organizations import organizations + +if TYPE_CHECKING: + from .resources import users, health, projects, organizations + from .resources.health import HealthResource, AsyncHealthResource + from .resources.users.users import UsersResource, AsyncUsersResource + from .resources.projects.projects import ProjectsResource, AsyncProjectsResource + from .resources.organizations.organizations import OrganizationsResource, AsyncOrganizationsResource __all__ = [ "ENVIRONMENTS", @@ -54,13 +58,6 @@ class Codex(SyncAPIClient): - health: health.HealthResource - organizations: organizations.OrganizationsResource - users: users.UsersResource - projects: projects.ProjectsResource - with_raw_response: CodexWithRawResponse - with_streaming_response: CodexWithStreamedResponse - # client options auth_token: str | None api_key: str | None @@ -138,12 +135,37 @@ def __init__( _strict_response_validation=_strict_response_validation, ) - self.health = health.HealthResource(self) - self.organizations = organizations.OrganizationsResource(self) - self.users = users.UsersResource(self) - self.projects = projects.ProjectsResource(self) - self.with_raw_response = CodexWithRawResponse(self) - self.with_streaming_response = CodexWithStreamedResponse(self) + @cached_property + def health(self) -> HealthResource: + from .resources.health import HealthResource + + return HealthResource(self) + + @cached_property + def organizations(self) -> OrganizationsResource: + from .resources.organizations import OrganizationsResource + + return OrganizationsResource(self) + + @cached_property + def users(self) -> UsersResource: + from .resources.users import UsersResource + + return UsersResource(self) + + @cached_property + def projects(self) -> ProjectsResource: + from .resources.projects import ProjectsResource + + return ProjectsResource(self) + + @cached_property + def with_raw_response(self) -> CodexWithRawResponse: + return CodexWithRawResponse(self) + + @cached_property + def with_streaming_response(self) -> CodexWithStreamedResponse: + return CodexWithStreamedResponse(self) @property @override @@ -298,13 +320,6 @@ def _make_status_error( class AsyncCodex(AsyncAPIClient): - health: health.AsyncHealthResource - organizations: organizations.AsyncOrganizationsResource - users: users.AsyncUsersResource - projects: projects.AsyncProjectsResource - with_raw_response: AsyncCodexWithRawResponse - with_streaming_response: AsyncCodexWithStreamedResponse - # client options auth_token: str | None api_key: str | None @@ -382,12 +397,37 @@ def __init__( _strict_response_validation=_strict_response_validation, ) - self.health = health.AsyncHealthResource(self) - self.organizations = organizations.AsyncOrganizationsResource(self) - self.users = users.AsyncUsersResource(self) - self.projects = projects.AsyncProjectsResource(self) - self.with_raw_response = AsyncCodexWithRawResponse(self) - self.with_streaming_response = AsyncCodexWithStreamedResponse(self) + @cached_property 
+ def health(self) -> AsyncHealthResource: + from .resources.health import AsyncHealthResource + + return AsyncHealthResource(self) + + @cached_property + def organizations(self) -> AsyncOrganizationsResource: + from .resources.organizations import AsyncOrganizationsResource + + return AsyncOrganizationsResource(self) + + @cached_property + def users(self) -> AsyncUsersResource: + from .resources.users import AsyncUsersResource + + return AsyncUsersResource(self) + + @cached_property + def projects(self) -> AsyncProjectsResource: + from .resources.projects import AsyncProjectsResource + + return AsyncProjectsResource(self) + + @cached_property + def with_raw_response(self) -> AsyncCodexWithRawResponse: + return AsyncCodexWithRawResponse(self) + + @cached_property + def with_streaming_response(self) -> AsyncCodexWithStreamedResponse: + return AsyncCodexWithStreamedResponse(self) @property @override @@ -542,35 +582,127 @@ def _make_status_error( class CodexWithRawResponse: + _client: Codex + def __init__(self, client: Codex) -> None: - self.health = health.HealthResourceWithRawResponse(client.health) - self.organizations = organizations.OrganizationsResourceWithRawResponse(client.organizations) - self.users = users.UsersResourceWithRawResponse(client.users) - self.projects = projects.ProjectsResourceWithRawResponse(client.projects) + self._client = client + + @cached_property + def health(self) -> health.HealthResourceWithRawResponse: + from .resources.health import HealthResourceWithRawResponse + + return HealthResourceWithRawResponse(self._client.health) + + @cached_property + def organizations(self) -> organizations.OrganizationsResourceWithRawResponse: + from .resources.organizations import OrganizationsResourceWithRawResponse + + return OrganizationsResourceWithRawResponse(self._client.organizations) + + @cached_property + def users(self) -> users.UsersResourceWithRawResponse: + from .resources.users import UsersResourceWithRawResponse + + return UsersResourceWithRawResponse(self._client.users) + + @cached_property + def projects(self) -> projects.ProjectsResourceWithRawResponse: + from .resources.projects import ProjectsResourceWithRawResponse + + return ProjectsResourceWithRawResponse(self._client.projects) class AsyncCodexWithRawResponse: + _client: AsyncCodex + def __init__(self, client: AsyncCodex) -> None: - self.health = health.AsyncHealthResourceWithRawResponse(client.health) - self.organizations = organizations.AsyncOrganizationsResourceWithRawResponse(client.organizations) - self.users = users.AsyncUsersResourceWithRawResponse(client.users) - self.projects = projects.AsyncProjectsResourceWithRawResponse(client.projects) + self._client = client + + @cached_property + def health(self) -> health.AsyncHealthResourceWithRawResponse: + from .resources.health import AsyncHealthResourceWithRawResponse + + return AsyncHealthResourceWithRawResponse(self._client.health) + + @cached_property + def organizations(self) -> organizations.AsyncOrganizationsResourceWithRawResponse: + from .resources.organizations import AsyncOrganizationsResourceWithRawResponse + + return AsyncOrganizationsResourceWithRawResponse(self._client.organizations) + + @cached_property + def users(self) -> users.AsyncUsersResourceWithRawResponse: + from .resources.users import AsyncUsersResourceWithRawResponse + + return AsyncUsersResourceWithRawResponse(self._client.users) + + @cached_property + def projects(self) -> projects.AsyncProjectsResourceWithRawResponse: + from .resources.projects import 
AsyncProjectsResourceWithRawResponse + + return AsyncProjectsResourceWithRawResponse(self._client.projects) class CodexWithStreamedResponse: + _client: Codex + def __init__(self, client: Codex) -> None: - self.health = health.HealthResourceWithStreamingResponse(client.health) - self.organizations = organizations.OrganizationsResourceWithStreamingResponse(client.organizations) - self.users = users.UsersResourceWithStreamingResponse(client.users) - self.projects = projects.ProjectsResourceWithStreamingResponse(client.projects) + self._client = client + + @cached_property + def health(self) -> health.HealthResourceWithStreamingResponse: + from .resources.health import HealthResourceWithStreamingResponse + + return HealthResourceWithStreamingResponse(self._client.health) + + @cached_property + def organizations(self) -> organizations.OrganizationsResourceWithStreamingResponse: + from .resources.organizations import OrganizationsResourceWithStreamingResponse + + return OrganizationsResourceWithStreamingResponse(self._client.organizations) + + @cached_property + def users(self) -> users.UsersResourceWithStreamingResponse: + from .resources.users import UsersResourceWithStreamingResponse + + return UsersResourceWithStreamingResponse(self._client.users) + + @cached_property + def projects(self) -> projects.ProjectsResourceWithStreamingResponse: + from .resources.projects import ProjectsResourceWithStreamingResponse + + return ProjectsResourceWithStreamingResponse(self._client.projects) class AsyncCodexWithStreamedResponse: + _client: AsyncCodex + def __init__(self, client: AsyncCodex) -> None: - self.health = health.AsyncHealthResourceWithStreamingResponse(client.health) - self.organizations = organizations.AsyncOrganizationsResourceWithStreamingResponse(client.organizations) - self.users = users.AsyncUsersResourceWithStreamingResponse(client.users) - self.projects = projects.AsyncProjectsResourceWithStreamingResponse(client.projects) + self._client = client + + @cached_property + def health(self) -> health.AsyncHealthResourceWithStreamingResponse: + from .resources.health import AsyncHealthResourceWithStreamingResponse + + return AsyncHealthResourceWithStreamingResponse(self._client.health) + + @cached_property + def organizations(self) -> organizations.AsyncOrganizationsResourceWithStreamingResponse: + from .resources.organizations import AsyncOrganizationsResourceWithStreamingResponse + + return AsyncOrganizationsResourceWithStreamingResponse(self._client.organizations) + + @cached_property + def users(self) -> users.AsyncUsersResourceWithStreamingResponse: + from .resources.users import AsyncUsersResourceWithStreamingResponse + + return AsyncUsersResourceWithStreamingResponse(self._client.users) + + @cached_property + def projects(self) -> projects.AsyncProjectsResourceWithStreamingResponse: + from .resources.projects import AsyncProjectsResourceWithStreamingResponse + + return AsyncProjectsResourceWithStreamingResponse(self._client.projects) Client = Codex diff --git a/src/codex/_streaming.py b/src/codex/_streaming.py index d9c4a80..e6c997e 100644 --- a/src/codex/_streaming.py +++ b/src/codex/_streaming.py @@ -54,11 +54,12 @@ def __stream__(self) -> Iterator[_T]: process_data = self._client._process_response_data iterator = self._iter_events() - for sse in iterator: - yield process_data(data=sse.json(), cast_to=cast_to, response=response) - - # As we might not fully consume the response stream, we need to close it explicitly - response.close() + try: + for sse in iterator: + yield 
process_data(data=sse.json(), cast_to=cast_to, response=response) + finally: + # Ensure the response is closed even if the consumer doesn't read all data + response.close() def __enter__(self) -> Self: return self @@ -117,11 +118,12 @@ async def __stream__(self) -> AsyncIterator[_T]: process_data = self._client._process_response_data iterator = self._iter_events() - async for sse in iterator: - yield process_data(data=sse.json(), cast_to=cast_to, response=response) - - # As we might not fully consume the response stream, we need to close it explicitly - await response.aclose() + try: + async for sse in iterator: + yield process_data(data=sse.json(), cast_to=cast_to, response=response) + finally: + # Ensure the response is closed even if the consumer doesn't read all data + await response.aclose() async def __aenter__(self) -> Self: return self diff --git a/src/codex/_types.py b/src/codex/_types.py index 2e4695f..edc28a8 100644 --- a/src/codex/_types.py +++ b/src/codex/_types.py @@ -243,6 +243,9 @@ class HttpxSendArgs(TypedDict, total=False): if TYPE_CHECKING: # This works because str.__contains__ does not accept object (either in typeshed or at runtime) # https://github.com/hauntsaninja/useful_types/blob/5e9710f3875107d068e7679fd7fec9cfab0eff3b/useful_types/__init__.py#L285 + # + # Note: index() and count() methods are intentionally omitted to allow pyright to properly + # infer TypedDict types when dict literals are used in lists assigned to SequenceNotStr. class SequenceNotStr(Protocol[_T_co]): @overload def __getitem__(self, index: SupportsIndex, /) -> _T_co: ... @@ -251,8 +254,6 @@ def __getitem__(self, index: slice, /) -> Sequence[_T_co]: ... def __contains__(self, value: object, /) -> bool: ... def __len__(self) -> int: ... def __iter__(self) -> Iterator[_T_co]: ... - def index(self, value: Any, start: int = 0, stop: int = ..., /) -> int: ... - def count(self, value: Any, /) -> int: ... def __reversed__(self) -> Iterator[_T_co]: ... else: # just point this to a normal `Sequence` at runtime to avoid having to special case diff --git a/src/codex/_version.py b/src/codex/_version.py index c2ea81e..0bddca9 100644 --- a/src/codex/_version.py +++ b/src/codex/_version.py @@ -1,4 +1,4 @@ # File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. 
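The `_client.py` rewrite above replaces eager resource construction in `__init__` with `cached_property` accessors whose imports are deferred into the accessor bodies; this lines up with the "speedup initial import" chore in the changelog, since `import codex` no longer pulls in every resource module up front. A minimal, self-contained sketch of the pattern using toy names (not the SDK's real modules):

```python
from functools import cached_property

class ProjectsResource:
    """Stand-in for a resource class defined in a heavy submodule."""

    def __init__(self, client: "Client") -> None:
        self._client = client

class Client:
    @cached_property
    def projects(self) -> ProjectsResource:
        # The SDK performs a deferred `from .resources.projects import
        # ProjectsResource` here, so the submodule is only imported on
        # first attribute access instead of at `import codex` time.
        return ProjectsResource(self)

client = Client()
assert client.projects is client.projects  # memoized after first access
```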
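The `_streaming.py` hunks above move `response.close()` / `await response.aclose()` into a `finally` block so the response is released even when the consumer stops iterating early or an exception interrupts the loop; the old code only closed after a fully drained loop. A standalone sketch of the same generator pattern, with a stub in place of `httpx.Response`:

```python
from typing import Iterator, List

class StubResponse:
    """Stand-in for httpx.Response; only close() matters for this sketch."""

    def __init__(self) -> None:
        self.closed = False

    def close(self) -> None:
        self.closed = True

def stream(response: StubResponse, events: List[str]) -> Iterator[str]:
    try:
        for event in events:
            yield event
    finally:
        # Runs on normal exhaustion *and* on GeneratorExit when the
        # consumer abandons the stream early -- the case the fix targets.
        response.close()

resp = StubResponse()
gen = stream(resp, ["a", "b", "c"])
next(gen)    # consume a single event
gen.close()  # abandon the stream: raises GeneratorExit inside the generator
assert resp.closed
```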
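On the `_types.py` hunk above: as the new comment explains, `index()` and `count()` were dropped from the `SequenceNotStr` protocol because their presence kept pyright from inferring TypedDict element types for dict literals in list arguments. A hedged illustration with a hypothetical `Message` TypedDict (the trimmed protocol omits the `__getitem__`/`__reversed__` members for brevity):

```python
from typing import Iterator, Protocol, TypeVar
from typing_extensions import TypedDict

_T_co = TypeVar("_T_co", covariant=True)

class SequenceNotStr(Protocol[_T_co]):
    # str.__contains__ does not accept `object`, so `str` fails to match
    # this protocol while list and tuple still do (same trick as the SDK).
    def __contains__(self, value: object, /) -> bool: ...
    def __len__(self) -> int: ...
    def __iter__(self) -> Iterator[_T_co]: ...

class Message(TypedDict):  # hypothetical payload type, not from the SDK
    role: str
    content: str

def send(messages: SequenceNotStr[Message]) -> None:
    for m in messages:
        print(m["role"], m["content"])

# With index()/count() gone, pyright checks this literal as list[Message]
# rather than widening the dicts and losing the TypedDict keys:
send([{"role": "user", "content": "hi"}])
```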
__title__ = "codex" -__version__ = "0.1.0-alpha.34" # x-release-please-version +__version__ = "0.1.0-alpha.35" # x-release-please-version diff --git a/src/codex/types/organization_list_members_response.py b/src/codex/types/organization_list_members_response.py index 1fa593e..f37c1f1 100644 --- a/src/codex/types/organization_list_members_response.py +++ b/src/codex/types/organization_list_members_response.py @@ -9,6 +9,8 @@ class OrganizationListMembersResponseItem(BaseModel): + """Schema for public organization member information.""" + email: str name: str diff --git a/src/codex/types/project_create_params.py b/src/codex/types/project_create_params.py index 4704f63..bd84b1c 100644 --- a/src/codex/types/project_create_params.py +++ b/src/codex/types/project_create_params.py @@ -39,6 +39,8 @@ class ProjectCreateParams(TypedDict, total=False): class ConfigEvalConfigCustomEvalsEvalsGuardrailedFallback(TypedDict, total=False): + """message, priority, type""" + message: Required[str] """ Fallback message to use if this eval fails and causes the response to be @@ -56,6 +58,11 @@ class ConfigEvalConfigCustomEvalsEvalsGuardrailedFallback(TypedDict, total=False class ConfigEvalConfigCustomEvalsEvals(TypedDict, total=False): + """A custom evaluation metric created by users. + + The TLMEvalSchema are mutable and stored in the database. + """ + criteria: Required[str] """ The evaluation criteria text that describes what aspect is being evaluated and @@ -120,10 +127,14 @@ class ConfigEvalConfigCustomEvalsEvals(TypedDict, total=False): class ConfigEvalConfigCustomEvals(TypedDict, total=False): + """Configuration for custom evaluation metrics.""" + evals: Dict[str, ConfigEvalConfigCustomEvalsEvals] class ConfigEvalConfigDefaultEvalsContextSufficiencyGuardrailedFallback(TypedDict, total=False): + """message, priority, type""" + message: Required[str] """ Fallback message to use if this eval fails and causes the response to be @@ -141,6 +152,12 @@ class ConfigEvalConfigDefaultEvalsContextSufficiencyGuardrailedFallback(TypedDic class ConfigEvalConfigDefaultEvalsContextSufficiency(TypedDict, total=False): + """A pre-configured evaluation metric from TrustworthyRAG or built into the system. + + The evaluation criteria and identifiers are immutable and system-managed, + while other properties like thresholds and priorities can be configured. + """ + eval_key: Required[str] """ Unique key for eval metric - currently maps to the TrustworthyRAG name property @@ -179,6 +196,8 @@ class ConfigEvalConfigDefaultEvalsContextSufficiency(TypedDict, total=False): class ConfigEvalConfigDefaultEvalsQueryEaseGuardrailedFallback(TypedDict, total=False): + """message, priority, type""" + message: Required[str] """ Fallback message to use if this eval fails and causes the response to be @@ -196,6 +215,12 @@ class ConfigEvalConfigDefaultEvalsQueryEaseGuardrailedFallback(TypedDict, total= class ConfigEvalConfigDefaultEvalsQueryEase(TypedDict, total=False): + """A pre-configured evaluation metric from TrustworthyRAG or built into the system. + + The evaluation criteria and identifiers are immutable and system-managed, + while other properties like thresholds and priorities can be configured. 
+ """ + eval_key: Required[str] """ Unique key for eval metric - currently maps to the TrustworthyRAG name property @@ -234,6 +259,8 @@ class ConfigEvalConfigDefaultEvalsQueryEase(TypedDict, total=False): class ConfigEvalConfigDefaultEvalsResponseGroundednessGuardrailedFallback(TypedDict, total=False): + """message, priority, type""" + message: Required[str] """ Fallback message to use if this eval fails and causes the response to be @@ -251,6 +278,12 @@ class ConfigEvalConfigDefaultEvalsResponseGroundednessGuardrailedFallback(TypedD class ConfigEvalConfigDefaultEvalsResponseGroundedness(TypedDict, total=False): + """A pre-configured evaluation metric from TrustworthyRAG or built into the system. + + The evaluation criteria and identifiers are immutable and system-managed, + while other properties like thresholds and priorities can be configured. + """ + eval_key: Required[str] """ Unique key for eval metric - currently maps to the TrustworthyRAG name property @@ -289,6 +322,8 @@ class ConfigEvalConfigDefaultEvalsResponseGroundedness(TypedDict, total=False): class ConfigEvalConfigDefaultEvalsResponseHelpfulnessGuardrailedFallback(TypedDict, total=False): + """message, priority, type""" + message: Required[str] """ Fallback message to use if this eval fails and causes the response to be @@ -306,6 +341,12 @@ class ConfigEvalConfigDefaultEvalsResponseHelpfulnessGuardrailedFallback(TypedDi class ConfigEvalConfigDefaultEvalsResponseHelpfulness(TypedDict, total=False): + """A pre-configured evaluation metric from TrustworthyRAG or built into the system. + + The evaluation criteria and identifiers are immutable and system-managed, + while other properties like thresholds and priorities can be configured. + """ + eval_key: Required[str] """ Unique key for eval metric - currently maps to the TrustworthyRAG name property @@ -344,6 +385,8 @@ class ConfigEvalConfigDefaultEvalsResponseHelpfulness(TypedDict, total=False): class ConfigEvalConfigDefaultEvalsTrustworthinessGuardrailedFallback(TypedDict, total=False): + """message, priority, type""" + message: Required[str] """ Fallback message to use if this eval fails and causes the response to be @@ -361,6 +404,12 @@ class ConfigEvalConfigDefaultEvalsTrustworthinessGuardrailedFallback(TypedDict, class ConfigEvalConfigDefaultEvalsTrustworthiness(TypedDict, total=False): + """A pre-configured evaluation metric from TrustworthyRAG or built into the system. + + The evaluation criteria and identifiers are immutable and system-managed, + while other properties like thresholds and priorities can be configured. + """ + eval_key: Required[str] """ Unique key for eval metric - currently maps to the TrustworthyRAG name property @@ -399,6 +448,8 @@ class ConfigEvalConfigDefaultEvalsTrustworthiness(TypedDict, total=False): class ConfigEvalConfigDefaultEvals(TypedDict, total=False): + """Configuration for default evaluation metrics.""" + context_sufficiency: ConfigEvalConfigDefaultEvalsContextSufficiency """A pre-configured evaluation metric from TrustworthyRAG or built into the system. 
@@ -436,6 +487,8 @@ class ConfigEvalConfigDefaultEvals(TypedDict, total=False): class ConfigEvalConfig(TypedDict, total=False): + """Configuration for project-specific evaluation metrics""" + custom_evals: ConfigEvalConfigCustomEvals """Configuration for custom evaluation metrics.""" diff --git a/src/codex/types/project_detect_params.py b/src/codex/types/project_detect_params.py index 8e93971..9cbf9bc 100644 --- a/src/codex/types/project_detect_params.py +++ b/src/codex/types/project_detect_params.py @@ -440,6 +440,8 @@ class ResponseChatCompletionTyped(TypedDict, total=False): class EvalConfigCustomEvalsEvalsGuardrailedFallback(TypedDict, total=False): + """message, priority, type""" + message: Required[str] """ Fallback message to use if this eval fails and causes the response to be @@ -457,6 +459,11 @@ class EvalConfigCustomEvalsEvalsGuardrailedFallback(TypedDict, total=False): class EvalConfigCustomEvalsEvals(TypedDict, total=False): + """A custom evaluation metric created by users. + + The TLMEvalSchema are mutable and stored in the database. + """ + criteria: Required[str] """ The evaluation criteria text that describes what aspect is being evaluated and @@ -521,10 +528,14 @@ class EvalConfigCustomEvalsEvals(TypedDict, total=False): class EvalConfigCustomEvals(TypedDict, total=False): + """Configuration for custom evaluation metrics.""" + evals: Dict[str, EvalConfigCustomEvalsEvals] class EvalConfigDefaultEvalsContextSufficiencyGuardrailedFallback(TypedDict, total=False): + """message, priority, type""" + message: Required[str] """ Fallback message to use if this eval fails and causes the response to be @@ -542,6 +553,12 @@ class EvalConfigDefaultEvalsContextSufficiencyGuardrailedFallback(TypedDict, tot class EvalConfigDefaultEvalsContextSufficiency(TypedDict, total=False): + """A pre-configured evaluation metric from TrustworthyRAG or built into the system. + + The evaluation criteria and identifiers are immutable and system-managed, + while other properties like thresholds and priorities can be configured. + """ + eval_key: Required[str] """ Unique key for eval metric - currently maps to the TrustworthyRAG name property @@ -580,6 +597,8 @@ class EvalConfigDefaultEvalsContextSufficiency(TypedDict, total=False): class EvalConfigDefaultEvalsQueryEaseGuardrailedFallback(TypedDict, total=False): + """message, priority, type""" + message: Required[str] """ Fallback message to use if this eval fails and causes the response to be @@ -597,6 +616,12 @@ class EvalConfigDefaultEvalsQueryEaseGuardrailedFallback(TypedDict, total=False) class EvalConfigDefaultEvalsQueryEase(TypedDict, total=False): + """A pre-configured evaluation metric from TrustworthyRAG or built into the system. + + The evaluation criteria and identifiers are immutable and system-managed, + while other properties like thresholds and priorities can be configured. 
+ """ + eval_key: Required[str] """ Unique key for eval metric - currently maps to the TrustworthyRAG name property @@ -635,6 +660,8 @@ class EvalConfigDefaultEvalsQueryEase(TypedDict, total=False): class EvalConfigDefaultEvalsResponseGroundednessGuardrailedFallback(TypedDict, total=False): + """message, priority, type""" + message: Required[str] """ Fallback message to use if this eval fails and causes the response to be @@ -652,6 +679,12 @@ class EvalConfigDefaultEvalsResponseGroundednessGuardrailedFallback(TypedDict, t class EvalConfigDefaultEvalsResponseGroundedness(TypedDict, total=False): + """A pre-configured evaluation metric from TrustworthyRAG or built into the system. + + The evaluation criteria and identifiers are immutable and system-managed, + while other properties like thresholds and priorities can be configured. + """ + eval_key: Required[str] """ Unique key for eval metric - currently maps to the TrustworthyRAG name property @@ -690,6 +723,8 @@ class EvalConfigDefaultEvalsResponseGroundedness(TypedDict, total=False): class EvalConfigDefaultEvalsResponseHelpfulnessGuardrailedFallback(TypedDict, total=False): + """message, priority, type""" + message: Required[str] """ Fallback message to use if this eval fails and causes the response to be @@ -707,6 +742,12 @@ class EvalConfigDefaultEvalsResponseHelpfulnessGuardrailedFallback(TypedDict, to class EvalConfigDefaultEvalsResponseHelpfulness(TypedDict, total=False): + """A pre-configured evaluation metric from TrustworthyRAG or built into the system. + + The evaluation criteria and identifiers are immutable and system-managed, + while other properties like thresholds and priorities can be configured. + """ + eval_key: Required[str] """ Unique key for eval metric - currently maps to the TrustworthyRAG name property @@ -745,6 +786,8 @@ class EvalConfigDefaultEvalsResponseHelpfulness(TypedDict, total=False): class EvalConfigDefaultEvalsTrustworthinessGuardrailedFallback(TypedDict, total=False): + """message, priority, type""" + message: Required[str] """ Fallback message to use if this eval fails and causes the response to be @@ -762,6 +805,12 @@ class EvalConfigDefaultEvalsTrustworthinessGuardrailedFallback(TypedDict, total= class EvalConfigDefaultEvalsTrustworthiness(TypedDict, total=False): + """A pre-configured evaluation metric from TrustworthyRAG or built into the system. + + The evaluation criteria and identifiers are immutable and system-managed, + while other properties like thresholds and priorities can be configured. + """ + eval_key: Required[str] """ Unique key for eval metric - currently maps to the TrustworthyRAG name property @@ -800,6 +849,8 @@ class EvalConfigDefaultEvalsTrustworthiness(TypedDict, total=False): class EvalConfigDefaultEvals(TypedDict, total=False): + """Configuration for default evaluation metrics.""" + context_sufficiency: EvalConfigDefaultEvalsContextSufficiency """A pre-configured evaluation metric from TrustworthyRAG or built into the system. @@ -837,6 +888,8 @@ class EvalConfigDefaultEvals(TypedDict, total=False): class EvalConfig(TypedDict, total=False): + """All of the evals that should be used for this query""" + custom_evals: EvalConfigCustomEvals """Configuration for custom evaluation metrics.""" @@ -1041,6 +1094,80 @@ class MessageChatCompletionDeveloperMessageParam(TypedDict, total=False): class Options(TypedDict, total=False): + """ + Typed dict of advanced configuration options for the Trustworthy Language Model. 
+ Many of these configurations are determined by the quality preset selected + (learn about quality presets in the TLM [initialization method](./#class-tlm)). + Specifying TLMOptions values directly overrides any default values set from the quality preset. + + For all options described below, higher settings will lead to longer runtimes and may consume more tokens internally. + You may not be able to run long prompts (or prompts with long responses) in your account, + unless your token/rate limits are increased. If you hit token limit issues, try lower/less expensive TLMOptions + to be able to run longer prompts/responses, or contact Cleanlab to increase your limits. + + The default values corresponding to each quality preset are: + - **best:** `num_consistency_samples` = 8, `num_self_reflections` = 3, `reasoning_effort` = `"high"`. + - **high:** `num_consistency_samples` = 4, `num_self_reflections` = 3, `reasoning_effort` = `"high"`. + - **medium:** `num_consistency_samples` = 0, `num_self_reflections` = 3, `reasoning_effort` = `"high"`. + - **low:** `num_consistency_samples` = 0, `num_self_reflections` = 3, `reasoning_effort` = `"none"`. + - **base:** `num_consistency_samples` = 0, `num_self_reflections` = 1, `reasoning_effort` = `"none"`. + + By default, TLM uses the "medium" `quality_preset`, "gpt-4.1-mini" base `model`, and `max_tokens` is set to 512. + You can set custom values for these arguments regardless of the quality preset specified. + + Args: + model ({"gpt-5", "gpt-5-mini", "gpt-5-nano", "gpt-4.1", "gpt-4.1-mini", "gpt-4.1-nano", "o4-mini", "o3", "gpt-4.5-preview", "gpt-4o-mini", "gpt-4o", "o3-mini", "o1", "o1-mini", "gpt-4", "gpt-3.5-turbo-16k", "claude-opus-4-0", "claude-sonnet-4-0", "claude-3.7-sonnet", "claude-3.5-sonnet-v2", "claude-3.5-sonnet", "claude-3.5-haiku", "claude-3-haiku", "nova-micro", "nova-lite", "nova-pro"}, default = "gpt-4.1-mini"): Underlying base LLM to use (better models yield better results, faster models yield faster results). + - Models still in beta: "o3", "o1", "o4-mini", "o3-mini", "o1-mini", "gpt-4.5-preview", "claude-opus-4-0", "claude-sonnet-4-0", "claude-3.7-sonnet", "claude-3.5-haiku". + - Recommended models for accuracy: "gpt-5", "gpt-4.1", "o4-mini", "o3", "claude-opus-4-0", "claude-sonnet-4-0". + - Recommended models for low latency/costs: "gpt-4.1-nano", "nova-micro". + + log (list[str], default = []): optionally specify additional logs or metadata that TLM should return. + For instance, include "explanation" here to get explanations of why a response is scored with low trustworthiness. + + custom_eval_criteria (list[dict[str, Any]], default = []): optionally specify custom evaluation criteria beyond the built-in trustworthiness scoring. + The expected input format is a list of dictionaries, where each dictionary has the following keys: + - name: Name of the evaluation criteria. + - criteria: Instructions specifying the evaluation criteria. + + max_tokens (int, default = 512): the maximum number of tokens that can be generated in the response from `TLM.prompt()` as well as during internal trustworthiness scoring. + If you experience token/rate-limit errors, try lowering this number. + For OpenAI models, this parameter must be between 64 and 4096. For Claude models, this parameter must be between 64 and 512.
+ + reasoning_effort ({"none", "low", "medium", "high"}, default = "high"): how much the internal LLM calls are allowed to reason (number of thinking tokens) + when generating alternative possible responses and reflecting on responses during trustworthiness scoring. + Reduce this value to reduce runtimes. Higher values may improve trust scoring. + + num_self_reflections (int, default = 3): the number of different evaluations to perform where the LLM reflects on the response, a factor affecting trust scoring. + The maximum number currently supported is 3. Lower values can reduce runtimes. + Reflection helps quantify aleatoric uncertainty associated with challenging prompts and catches responses that are noticeably incorrect/bad upon further analysis. + This parameter has no effect when `disable_trustworthiness` is True. + + num_consistency_samples (int, default = 8): the amount of internal sampling to measure LLM response consistency, a factor affecting trust scoring. + Must be between 0 and 20. Lower values can reduce runtimes. + Measuring consistency helps quantify the epistemic uncertainty associated with + strange prompts or prompts that are too vague/open-ended to receive a clearly defined 'good' response. + TLM measures consistency via the degree of contradiction between sampled responses that the model considers plausible. + This parameter has no effect when `disable_trustworthiness` is True. + + similarity_measure ({"semantic", "string", "embedding", "embedding_large", "code", "discrepancy"}, default = "discrepancy"): how the + trustworthiness scoring's consistency algorithm measures similarity between alternative responses considered plausible by the model. + Supported similarity measures include: "semantic" (based on natural language inference), + "embedding" (based on vector embedding similarity), "embedding_large" (based on a larger embedding model), + "code" (based on model-based analysis designed to compare code), "discrepancy" (based on model-based analysis of possible discrepancies), + and "string" (based on character/word overlap). Set this to "string" for minimal runtimes. + This parameter has no effect when `num_consistency_samples = 0`. + + num_candidate_responses (int, default = 1): how many alternative candidate responses are internally generated in `TLM.prompt()`. + `TLM.prompt()` scores the trustworthiness of each candidate response, and then returns the most trustworthy one. + You can auto-improve responses by increasing this parameter, but at higher runtimes/costs. + This parameter must be between 1 and 20. It has no effect on `TLM.score()`. + When this parameter is 1, `TLM.prompt()` simply returns a standard LLM response and does not attempt to auto-improve it. + This parameter has no effect when `disable_trustworthiness` is True. + + disable_trustworthiness (bool, default = False): if True, TLM will not compute trust scores, + useful if you only want to compute custom evaluation criteria.
+ """ + custom_eval_criteria: Iterable[object] disable_persistence: bool diff --git a/src/codex/types/project_detect_response.py b/src/codex/types/project_detect_response.py index df03c86..ff0d6ce 100644 --- a/src/codex/types/project_detect_response.py +++ b/src/codex/types/project_detect_response.py @@ -40,6 +40,10 @@ class EvalScores(BaseModel): class GuardrailedFallback(BaseModel): + """ + Name, fallback message, fallback priority, and fallback type of the triggered guardrail with the highest fallback priority + """ + message: str """ Fallback message to use if this eval fails and causes the response to be diff --git a/src/codex/types/project_list_response.py b/src/codex/types/project_list_response.py index e4ce558..d528e47 100644 --- a/src/codex/types/project_list_response.py +++ b/src/codex/types/project_list_response.py @@ -30,6 +30,8 @@ class ProjectConfigEvalConfigCustomEvalsEvalsGuardrailedFallback(BaseModel): + """message, priority, type""" + message: str """ Fallback message to use if this eval fails and causes the response to be @@ -47,6 +49,11 @@ class ProjectConfigEvalConfigCustomEvalsEvalsGuardrailedFallback(BaseModel): class ProjectConfigEvalConfigCustomEvalsEvals(BaseModel): + """A custom evaluation metric created by users. + + The TLMEvalSchema are mutable and stored in the database. + """ + criteria: str """ The evaluation criteria text that describes what aspect is being evaluated and @@ -117,10 +124,14 @@ class ProjectConfigEvalConfigCustomEvalsEvals(BaseModel): class ProjectConfigEvalConfigCustomEvals(BaseModel): + """Configuration for custom evaluation metrics.""" + evals: Optional[Dict[str, ProjectConfigEvalConfigCustomEvalsEvals]] = None class ProjectConfigEvalConfigDefaultEvalsContextSufficiencyGuardrailedFallback(BaseModel): + """message, priority, type""" + message: str """ Fallback message to use if this eval fails and causes the response to be @@ -138,6 +149,12 @@ class ProjectConfigEvalConfigDefaultEvalsContextSufficiencyGuardrailedFallback(B class ProjectConfigEvalConfigDefaultEvalsContextSufficiency(BaseModel): + """A pre-configured evaluation metric from TrustworthyRAG or built into the system. + + The evaluation criteria and identifiers are immutable and system-managed, + while other properties like thresholds and priorities can be configured. + """ + display_name: str """Human-friendly name for display. @@ -182,6 +199,8 @@ class ProjectConfigEvalConfigDefaultEvalsContextSufficiency(BaseModel): class ProjectConfigEvalConfigDefaultEvalsQueryEaseGuardrailedFallback(BaseModel): + """message, priority, type""" + message: str """ Fallback message to use if this eval fails and causes the response to be @@ -199,6 +218,12 @@ class ProjectConfigEvalConfigDefaultEvalsQueryEaseGuardrailedFallback(BaseModel) class ProjectConfigEvalConfigDefaultEvalsQueryEase(BaseModel): + """A pre-configured evaluation metric from TrustworthyRAG or built into the system. + + The evaluation criteria and identifiers are immutable and system-managed, + while other properties like thresholds and priorities can be configured. + """ + display_name: str """Human-friendly name for display. 
@@ -243,6 +268,8 @@ class ProjectConfigEvalConfigDefaultEvalsQueryEase(BaseModel): class ProjectConfigEvalConfigDefaultEvalsResponseGroundednessGuardrailedFallback(BaseModel): + """message, priority, type""" + message: str """ Fallback message to use if this eval fails and causes the response to be @@ -260,6 +287,12 @@ class ProjectConfigEvalConfigDefaultEvalsResponseGroundednessGuardrailedFallback class ProjectConfigEvalConfigDefaultEvalsResponseGroundedness(BaseModel): + """A pre-configured evaluation metric from TrustworthyRAG or built into the system. + + The evaluation criteria and identifiers are immutable and system-managed, + while other properties like thresholds and priorities can be configured. + """ + display_name: str """Human-friendly name for display. @@ -304,6 +337,8 @@ class ProjectConfigEvalConfigDefaultEvalsResponseGroundedness(BaseModel): class ProjectConfigEvalConfigDefaultEvalsResponseHelpfulnessGuardrailedFallback(BaseModel): + """message, priority, type""" + message: str """ Fallback message to use if this eval fails and causes the response to be @@ -321,6 +356,12 @@ class ProjectConfigEvalConfigDefaultEvalsResponseHelpfulnessGuardrailedFallback( class ProjectConfigEvalConfigDefaultEvalsResponseHelpfulness(BaseModel): + """A pre-configured evaluation metric from TrustworthyRAG or built into the system. + + The evaluation criteria and identifiers are immutable and system-managed, + while other properties like thresholds and priorities can be configured. + """ + display_name: str """Human-friendly name for display. @@ -365,6 +406,8 @@ class ProjectConfigEvalConfigDefaultEvalsResponseHelpfulness(BaseModel): class ProjectConfigEvalConfigDefaultEvalsTrustworthinessGuardrailedFallback(BaseModel): + """message, priority, type""" + message: str """ Fallback message to use if this eval fails and causes the response to be @@ -382,6 +425,12 @@ class ProjectConfigEvalConfigDefaultEvalsTrustworthinessGuardrailedFallback(Base class ProjectConfigEvalConfigDefaultEvalsTrustworthiness(BaseModel): + """A pre-configured evaluation metric from TrustworthyRAG or built into the system. + + The evaluation criteria and identifiers are immutable and system-managed, + while other properties like thresholds and priorities can be configured. + """ + display_name: str """Human-friendly name for display. @@ -426,6 +475,8 @@ class ProjectConfigEvalConfigDefaultEvalsTrustworthiness(BaseModel): class ProjectConfigEvalConfigDefaultEvals(BaseModel): + """Configuration for default evaluation metrics.""" + context_sufficiency: Optional[ProjectConfigEvalConfigDefaultEvalsContextSufficiency] = None """A pre-configured evaluation metric from TrustworthyRAG or built into the system. 
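Because every level of the `ProjectConfigEvalConfig...` response models above is `Optional`, consumers need None-guards when traversing a project's eval configuration. A small hedged helper (`eval_config` is any model shaped like `ProjectConfigEvalConfig`; attribute names mirror the field definitions above):

```python
def list_custom_evals(eval_config) -> list[str]:
    """Summarize configured custom evals as "key: criteria" strings.

    Both `custom_evals` and its `evals` dict are Optional per the model
    definitions above, hence the guards.
    """
    summaries: list[str] = []
    custom = eval_config.custom_evals
    if custom is not None and custom.evals is not None:
        for key, ev in custom.evals.items():
            summaries.append(f"{key}: {ev.criteria}")
    return summaries
```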
@@ -463,6 +514,8 @@ class ProjectConfigEvalConfigDefaultEvals(BaseModel): class ProjectConfigEvalConfig(BaseModel): + """Configuration for project-specific evaluation metrics""" + custom_evals: Optional[ProjectConfigEvalConfigCustomEvals] = None """Configuration for custom evaluation metrics.""" @@ -522,6 +575,8 @@ class Project(BaseModel): class Filters(BaseModel): + """Applied filters for the projects list request""" + query: Optional[str] = None diff --git a/src/codex/types/project_retrieve_response.py b/src/codex/types/project_retrieve_response.py index 8fe7741..abc05ad 100644 --- a/src/codex/types/project_retrieve_response.py +++ b/src/codex/types/project_retrieve_response.py @@ -28,6 +28,8 @@ class ConfigEvalConfigCustomEvalsEvalsGuardrailedFallback(BaseModel): + """message, priority, type""" + message: str """ Fallback message to use if this eval fails and causes the response to be @@ -45,6 +47,11 @@ class ConfigEvalConfigCustomEvalsEvalsGuardrailedFallback(BaseModel): class ConfigEvalConfigCustomEvalsEvals(BaseModel): + """A custom evaluation metric created by users. + + The TLMEvalSchema are mutable and stored in the database. + """ + criteria: str """ The evaluation criteria text that describes what aspect is being evaluated and @@ -115,10 +122,14 @@ class ConfigEvalConfigCustomEvalsEvals(BaseModel): class ConfigEvalConfigCustomEvals(BaseModel): + """Configuration for custom evaluation metrics.""" + evals: Optional[Dict[str, ConfigEvalConfigCustomEvalsEvals]] = None class ConfigEvalConfigDefaultEvalsContextSufficiencyGuardrailedFallback(BaseModel): + """message, priority, type""" + message: str """ Fallback message to use if this eval fails and causes the response to be @@ -136,6 +147,12 @@ class ConfigEvalConfigDefaultEvalsContextSufficiencyGuardrailedFallback(BaseMode class ConfigEvalConfigDefaultEvalsContextSufficiency(BaseModel): + """A pre-configured evaluation metric from TrustworthyRAG or built into the system. + + The evaluation criteria and identifiers are immutable and system-managed, + while other properties like thresholds and priorities can be configured. + """ + display_name: str """Human-friendly name for display. @@ -180,6 +197,8 @@ class ConfigEvalConfigDefaultEvalsContextSufficiency(BaseModel): class ConfigEvalConfigDefaultEvalsQueryEaseGuardrailedFallback(BaseModel): + """message, priority, type""" + message: str """ Fallback message to use if this eval fails and causes the response to be @@ -197,6 +216,12 @@ class ConfigEvalConfigDefaultEvalsQueryEaseGuardrailedFallback(BaseModel): class ConfigEvalConfigDefaultEvalsQueryEase(BaseModel): + """A pre-configured evaluation metric from TrustworthyRAG or built into the system. + + The evaluation criteria and identifiers are immutable and system-managed, + while other properties like thresholds and priorities can be configured. + """ + display_name: str """Human-friendly name for display. @@ -241,6 +266,8 @@ class ConfigEvalConfigDefaultEvalsQueryEase(BaseModel): class ConfigEvalConfigDefaultEvalsResponseGroundednessGuardrailedFallback(BaseModel): + """message, priority, type""" + message: str """ Fallback message to use if this eval fails and causes the response to be @@ -258,6 +285,12 @@ class ConfigEvalConfigDefaultEvalsResponseGroundednessGuardrailedFallback(BaseMo class ConfigEvalConfigDefaultEvalsResponseGroundedness(BaseModel): + """A pre-configured evaluation metric from TrustworthyRAG or built into the system. 
+ + The evaluation criteria and identifiers are immutable and system-managed, + while other properties like thresholds and priorities can be configured. + """ + display_name: str """Human-friendly name for display. @@ -302,6 +335,8 @@ class ConfigEvalConfigDefaultEvalsResponseGroundedness(BaseModel): class ConfigEvalConfigDefaultEvalsResponseHelpfulnessGuardrailedFallback(BaseModel): + """message, priority, type""" + message: str """ Fallback message to use if this eval fails and causes the response to be @@ -319,6 +354,12 @@ class ConfigEvalConfigDefaultEvalsResponseHelpfulnessGuardrailedFallback(BaseMod class ConfigEvalConfigDefaultEvalsResponseHelpfulness(BaseModel): + """A pre-configured evaluation metric from TrustworthyRAG or built into the system. + + The evaluation criteria and identifiers are immutable and system-managed, + while other properties like thresholds and priorities can be configured. + """ + display_name: str """Human-friendly name for display. @@ -363,6 +404,8 @@ class ConfigEvalConfigDefaultEvalsResponseHelpfulness(BaseModel): class ConfigEvalConfigDefaultEvalsTrustworthinessGuardrailedFallback(BaseModel): + """message, priority, type""" + message: str """ Fallback message to use if this eval fails and causes the response to be @@ -380,6 +423,12 @@ class ConfigEvalConfigDefaultEvalsTrustworthinessGuardrailedFallback(BaseModel): class ConfigEvalConfigDefaultEvalsTrustworthiness(BaseModel): + """A pre-configured evaluation metric from TrustworthyRAG or built into the system. + + The evaluation criteria and identifiers are immutable and system-managed, + while other properties like thresholds and priorities can be configured. + """ + display_name: str """Human-friendly name for display. @@ -424,6 +473,8 @@ class ConfigEvalConfigDefaultEvalsTrustworthiness(BaseModel): class ConfigEvalConfigDefaultEvals(BaseModel): + """Configuration for default evaluation metrics.""" + context_sufficiency: Optional[ConfigEvalConfigDefaultEvalsContextSufficiency] = None """A pre-configured evaluation metric from TrustworthyRAG or built into the system. @@ -461,6 +512,8 @@ class ConfigEvalConfigDefaultEvals(BaseModel): class ConfigEvalConfig(BaseModel): + """Configuration for project-specific evaluation metrics""" + custom_evals: Optional[ConfigEvalConfigCustomEvals] = None """Configuration for custom evaluation metrics.""" diff --git a/src/codex/types/project_return_schema.py b/src/codex/types/project_return_schema.py index 423d0ce..07a3a9b 100644 --- a/src/codex/types/project_return_schema.py +++ b/src/codex/types/project_return_schema.py @@ -28,6 +28,8 @@ class ConfigEvalConfigCustomEvalsEvalsGuardrailedFallback(BaseModel): + """message, priority, type""" + message: str """ Fallback message to use if this eval fails and causes the response to be @@ -45,6 +47,11 @@ class ConfigEvalConfigCustomEvalsEvalsGuardrailedFallback(BaseModel): class ConfigEvalConfigCustomEvalsEvals(BaseModel): + """A custom evaluation metric created by users. + + The TLMEvalSchema are mutable and stored in the database. 
+ """ + criteria: str """ The evaluation criteria text that describes what aspect is being evaluated and @@ -115,10 +122,14 @@ class ConfigEvalConfigCustomEvalsEvals(BaseModel): class ConfigEvalConfigCustomEvals(BaseModel): + """Configuration for custom evaluation metrics.""" + evals: Optional[Dict[str, ConfigEvalConfigCustomEvalsEvals]] = None class ConfigEvalConfigDefaultEvalsContextSufficiencyGuardrailedFallback(BaseModel): + """message, priority, type""" + message: str """ Fallback message to use if this eval fails and causes the response to be @@ -136,6 +147,12 @@ class ConfigEvalConfigDefaultEvalsContextSufficiencyGuardrailedFallback(BaseMode class ConfigEvalConfigDefaultEvalsContextSufficiency(BaseModel): + """A pre-configured evaluation metric from TrustworthyRAG or built into the system. + + The evaluation criteria and identifiers are immutable and system-managed, + while other properties like thresholds and priorities can be configured. + """ + display_name: str """Human-friendly name for display. @@ -180,6 +197,8 @@ class ConfigEvalConfigDefaultEvalsContextSufficiency(BaseModel): class ConfigEvalConfigDefaultEvalsQueryEaseGuardrailedFallback(BaseModel): + """message, priority, type""" + message: str """ Fallback message to use if this eval fails and causes the response to be @@ -197,6 +216,12 @@ class ConfigEvalConfigDefaultEvalsQueryEaseGuardrailedFallback(BaseModel): class ConfigEvalConfigDefaultEvalsQueryEase(BaseModel): + """A pre-configured evaluation metric from TrustworthyRAG or built into the system. + + The evaluation criteria and identifiers are immutable and system-managed, + while other properties like thresholds and priorities can be configured. + """ + display_name: str """Human-friendly name for display. @@ -241,6 +266,8 @@ class ConfigEvalConfigDefaultEvalsQueryEase(BaseModel): class ConfigEvalConfigDefaultEvalsResponseGroundednessGuardrailedFallback(BaseModel): + """message, priority, type""" + message: str """ Fallback message to use if this eval fails and causes the response to be @@ -258,6 +285,12 @@ class ConfigEvalConfigDefaultEvalsResponseGroundednessGuardrailedFallback(BaseMo class ConfigEvalConfigDefaultEvalsResponseGroundedness(BaseModel): + """A pre-configured evaluation metric from TrustworthyRAG or built into the system. + + The evaluation criteria and identifiers are immutable and system-managed, + while other properties like thresholds and priorities can be configured. + """ + display_name: str """Human-friendly name for display. @@ -302,6 +335,8 @@ class ConfigEvalConfigDefaultEvalsResponseGroundedness(BaseModel): class ConfigEvalConfigDefaultEvalsResponseHelpfulnessGuardrailedFallback(BaseModel): + """message, priority, type""" + message: str """ Fallback message to use if this eval fails and causes the response to be @@ -319,6 +354,12 @@ class ConfigEvalConfigDefaultEvalsResponseHelpfulnessGuardrailedFallback(BaseMod class ConfigEvalConfigDefaultEvalsResponseHelpfulness(BaseModel): + """A pre-configured evaluation metric from TrustworthyRAG or built into the system. + + The evaluation criteria and identifiers are immutable and system-managed, + while other properties like thresholds and priorities can be configured. + """ + display_name: str """Human-friendly name for display. 
@@ -363,6 +404,8 @@ class ConfigEvalConfigDefaultEvalsResponseHelpfulness(BaseModel):
 class ConfigEvalConfigDefaultEvalsTrustworthinessGuardrailedFallback(BaseModel):
+    """message, priority, type"""
+
     message: str
     """
     Fallback message to use if this eval fails and causes the response to be
@@ -380,6 +423,12 @@ class ConfigEvalConfigDefaultEvalsTrustworthinessGuardrailedFallback(BaseModel):
 class ConfigEvalConfigDefaultEvalsTrustworthiness(BaseModel):
+    """A pre-configured evaluation metric from TrustworthyRAG or built into the system.
+
+    The evaluation criteria and identifiers are immutable and system-managed,
+    while other properties like thresholds and priorities can be configured.
+    """
+
     display_name: str
     """Human-friendly name for display.

@@ -424,6 +473,8 @@ class ConfigEvalConfigDefaultEvalsTrustworthiness(BaseModel):
 class ConfigEvalConfigDefaultEvals(BaseModel):
+    """Configuration for default evaluation metrics."""
+
     context_sufficiency: Optional[ConfigEvalConfigDefaultEvalsContextSufficiency] = None
     """A pre-configured evaluation metric from TrustworthyRAG or built into the system.

@@ -461,6 +512,8 @@ class ConfigEvalConfigDefaultEvals(BaseModel):
 class ConfigEvalConfig(BaseModel):
+    """Configuration for project-specific evaluation metrics."""
+
     custom_evals: Optional[ConfigEvalConfigCustomEvals] = None
     """Configuration for custom evaluation metrics."""

diff --git a/src/codex/types/project_update_params.py b/src/codex/types/project_update_params.py
index 3557c2d..68cb0d3 100644
--- a/src/codex/types/project_update_params.py
+++ b/src/codex/types/project_update_params.py
@@ -37,6 +37,8 @@ class ProjectUpdateParams(TypedDict, total=False):
 class ConfigEvalConfigCustomEvalsEvalsGuardrailedFallback(TypedDict, total=False):
+    """message, priority, type"""
+
     message: Required[str]
     """
     Fallback message to use if this eval fails and causes the response to be
@@ -54,6 +56,11 @@ class ConfigEvalConfigCustomEvalsEvalsGuardrailedFallback(TypedDict, total=False):
 class ConfigEvalConfigCustomEvalsEvals(TypedDict, total=False):
+    """A custom evaluation metric created by users.
+
+    TLMEvalSchema objects are mutable and stored in the database.
+    """
+
     criteria: Required[str]
     """
     The evaluation criteria text that describes what aspect is being evaluated and
@@ -118,10 +125,14 @@ class ConfigEvalConfigCustomEvalsEvals(TypedDict, total=False):
 class ConfigEvalConfigCustomEvals(TypedDict, total=False):
+    """Configuration for custom evaluation metrics."""
+
     evals: Dict[str, ConfigEvalConfigCustomEvalsEvals]


 class ConfigEvalConfigDefaultEvalsContextSufficiencyGuardrailedFallback(TypedDict, total=False):
+    """message, priority, type"""
+
     message: Required[str]
     """
     Fallback message to use if this eval fails and causes the response to be
@@ -139,6 +150,12 @@ class ConfigEvalConfigDefaultEvalsContextSufficiencyGuardrailedFallback(TypedDict, total=False):
 class ConfigEvalConfigDefaultEvalsContextSufficiency(TypedDict, total=False):
+    """A pre-configured evaluation metric from TrustworthyRAG or built into the system.
+
+    The evaluation criteria and identifiers are immutable and system-managed,
+    while other properties like thresholds and priorities can be configured.
+ """ + eval_key: Required[str] """ Unique key for eval metric - currently maps to the TrustworthyRAG name property @@ -177,6 +194,8 @@ class ConfigEvalConfigDefaultEvalsContextSufficiency(TypedDict, total=False): class ConfigEvalConfigDefaultEvalsQueryEaseGuardrailedFallback(TypedDict, total=False): + """message, priority, type""" + message: Required[str] """ Fallback message to use if this eval fails and causes the response to be @@ -194,6 +213,12 @@ class ConfigEvalConfigDefaultEvalsQueryEaseGuardrailedFallback(TypedDict, total= class ConfigEvalConfigDefaultEvalsQueryEase(TypedDict, total=False): + """A pre-configured evaluation metric from TrustworthyRAG or built into the system. + + The evaluation criteria and identifiers are immutable and system-managed, + while other properties like thresholds and priorities can be configured. + """ + eval_key: Required[str] """ Unique key for eval metric - currently maps to the TrustworthyRAG name property @@ -232,6 +257,8 @@ class ConfigEvalConfigDefaultEvalsQueryEase(TypedDict, total=False): class ConfigEvalConfigDefaultEvalsResponseGroundednessGuardrailedFallback(TypedDict, total=False): + """message, priority, type""" + message: Required[str] """ Fallback message to use if this eval fails and causes the response to be @@ -249,6 +276,12 @@ class ConfigEvalConfigDefaultEvalsResponseGroundednessGuardrailedFallback(TypedD class ConfigEvalConfigDefaultEvalsResponseGroundedness(TypedDict, total=False): + """A pre-configured evaluation metric from TrustworthyRAG or built into the system. + + The evaluation criteria and identifiers are immutable and system-managed, + while other properties like thresholds and priorities can be configured. + """ + eval_key: Required[str] """ Unique key for eval metric - currently maps to the TrustworthyRAG name property @@ -287,6 +320,8 @@ class ConfigEvalConfigDefaultEvalsResponseGroundedness(TypedDict, total=False): class ConfigEvalConfigDefaultEvalsResponseHelpfulnessGuardrailedFallback(TypedDict, total=False): + """message, priority, type""" + message: Required[str] """ Fallback message to use if this eval fails and causes the response to be @@ -304,6 +339,12 @@ class ConfigEvalConfigDefaultEvalsResponseHelpfulnessGuardrailedFallback(TypedDi class ConfigEvalConfigDefaultEvalsResponseHelpfulness(TypedDict, total=False): + """A pre-configured evaluation metric from TrustworthyRAG or built into the system. + + The evaluation criteria and identifiers are immutable and system-managed, + while other properties like thresholds and priorities can be configured. + """ + eval_key: Required[str] """ Unique key for eval metric - currently maps to the TrustworthyRAG name property @@ -342,6 +383,8 @@ class ConfigEvalConfigDefaultEvalsResponseHelpfulness(TypedDict, total=False): class ConfigEvalConfigDefaultEvalsTrustworthinessGuardrailedFallback(TypedDict, total=False): + """message, priority, type""" + message: Required[str] """ Fallback message to use if this eval fails and causes the response to be @@ -359,6 +402,12 @@ class ConfigEvalConfigDefaultEvalsTrustworthinessGuardrailedFallback(TypedDict, class ConfigEvalConfigDefaultEvalsTrustworthiness(TypedDict, total=False): + """A pre-configured evaluation metric from TrustworthyRAG or built into the system. + + The evaluation criteria and identifiers are immutable and system-managed, + while other properties like thresholds and priorities can be configured. 
+ """ + eval_key: Required[str] """ Unique key for eval metric - currently maps to the TrustworthyRAG name property @@ -397,6 +446,8 @@ class ConfigEvalConfigDefaultEvalsTrustworthiness(TypedDict, total=False): class ConfigEvalConfigDefaultEvals(TypedDict, total=False): + """Configuration for default evaluation metrics.""" + context_sufficiency: ConfigEvalConfigDefaultEvalsContextSufficiency """A pre-configured evaluation metric from TrustworthyRAG or built into the system. @@ -434,6 +485,8 @@ class ConfigEvalConfigDefaultEvals(TypedDict, total=False): class ConfigEvalConfig(TypedDict, total=False): + """Configuration for project-specific evaluation metrics""" + custom_evals: ConfigEvalConfigCustomEvals """Configuration for custom evaluation metrics.""" diff --git a/src/codex/types/project_validate_params.py b/src/codex/types/project_validate_params.py index 0efa430..1ea5392 100644 --- a/src/codex/types/project_validate_params.py +++ b/src/codex/types/project_validate_params.py @@ -644,6 +644,80 @@ class MessageChatCompletionDeveloperMessageParam(TypedDict, total=False): class Options(TypedDict, total=False): + """ + Typed dict of advanced configuration options for the Trustworthy Language Model. + Many of these configurations are determined by the quality preset selected + (learn about quality presets in the TLM [initialization method](./#class-tlm)). + Specifying TLMOptions values directly overrides any default values set from the quality preset. + + For all options described below, higher settings will lead to longer runtimes and may consume more tokens internally. + You may not be able to run long prompts (or prompts with long responses) in your account, + unless your token/rate limits are increased. If you hit token limit issues, try lower/less expensive TLMOptions + to be able to run longer prompts/responses, or contact Cleanlab to increase your limits. + + The default values corresponding to each quality preset are: + - **best:** `num_consistency_samples` = 8, `num_self_reflections` = 3, `reasoning_effort` = `"high"`. + - **high:** `num_consistency_samples` = 4, `num_self_reflections` = 3, `reasoning_effort` = `"high"`. + - **medium:** `num_consistency_samples` = 0, `num_self_reflections` = 3, `reasoning_effort` = `"high"`. + - **low:** `num_consistency_samples` = 0, `num_self_reflections` = 3, `reasoning_effort` = `"none"`. + - **base:** `num_consistency_samples` = 0, `num_self_reflections` = 1, `reasoning_effort` = `"none"`. + + By default, TLM uses the: "medium" `quality_preset`, "gpt-4.1-mini" base `model`, and `max_tokens` is set to 512. + You can set custom values for these arguments regardless of the quality preset specified. + + Args: + model ({"gpt-5", "gpt-5-mini", "gpt-5-nano", "gpt-4.1", "gpt-4.1-mini", "gpt-4.1-nano", "o4-mini", "o3", "gpt-4.5-preview", "gpt-4o-mini", "gpt-4o", "o3-mini", "o1", "o1-mini", "gpt-4", "gpt-3.5-turbo-16k", "claude-opus-4-0", "claude-sonnet-4-0", "claude-3.7-sonnet", "claude-3.5-sonnet-v2", "claude-3.5-sonnet", "claude-3.5-haiku", "claude-3-haiku", "nova-micro", "nova-lite", "nova-pro"}, default = "gpt-4.1-mini"): Underlying base LLM to use (better models yield better results, faster models yield faster results). + - Models still in beta: "o3", "o1", "o4-mini", "o3-mini", "o1-mini", "gpt-4.5-preview", "claude-opus-4-0", "claude-sonnet-4-0", "claude-3.7-sonnet", "claude-3.5-haiku". + - Recommended models for accuracy: "gpt-5", "gpt-4.1", "o4-mini", "o3", "claude-opus-4-0", "claude-sonnet-4-0". 
+        - Recommended models for low latency/costs: "gpt-4.1-nano", "nova-micro".
+
+        log (list[str], default = []): optionally specify additional logs or metadata that TLM should return.
+        For instance, include "explanation" here to get explanations of why a response is scored with low trustworthiness.
+
+        custom_eval_criteria (list[dict[str, Any]], default = []): optionally specify custom evaluation criteria beyond the built-in trustworthiness scoring.
+        The expected input format is a list of dictionaries, where each dictionary has the following keys:
+        - name: Name of the evaluation criteria.
+        - criteria: Instructions specifying the evaluation criteria.
+
+        max_tokens (int, default = 512): the maximum number of tokens that can be generated in the response from `TLM.prompt()` as well as during internal trustworthiness scoring.
+        If you experience token/rate-limit errors, try lowering this number.
+        For OpenAI models, this parameter must be between 64 and 4096. For Claude models, this parameter must be between 64 and 512.
+
+        reasoning_effort ({"none", "low", "medium", "high"}, default = "high"): how much internal LLM calls are allowed to reason (number of thinking tokens)
+        when generating alternative possible responses and reflecting on responses during trustworthiness scoring.
+        Reduce this value to reduce runtimes. Higher values may improve trust scoring.
+
+        num_self_reflections (int, default = 3): the number of different evaluations to perform where the LLM reflects on the response, a factor affecting trust scoring.
+        The maximum number currently supported is 3. Lower values can reduce runtimes.
+        Reflection helps quantify aleatoric uncertainty associated with challenging prompts and catches responses that are noticeably incorrect/bad upon further analysis.
+        This parameter has no effect when `disable_trustworthiness` is True.
+
+        num_consistency_samples (int, default = 8): the amount of internal sampling to measure LLM response consistency, a factor affecting trust scoring.
+        Must be between 0 and 20. Lower values can reduce runtimes.
+        Measuring consistency helps quantify the epistemic uncertainty associated with
+        strange prompts or prompts that are too vague/open-ended to receive a clearly defined 'good' response.
+        TLM measures consistency via the degree of contradiction between sampled responses that the model considers plausible.
+        This parameter has no effect when `disable_trustworthiness` is True.
+
+        similarity_measure ({"semantic", "string", "embedding", "embedding_large", "code", "discrepancy"}, default = "discrepancy"): how the
+        trustworthiness scoring's consistency algorithm measures similarity between alternative responses considered plausible by the model.
+        Supported similarity measures include - "semantic" (based on natural language inference),
+        "embedding" (based on vector embedding similarity), "embedding_large" (based on a larger embedding model),
+        "code" (based on model-based analysis designed to compare code), "discrepancy" (based on model-based analysis of possible discrepancies),
+        and "string" (based on character/word overlap). Set this to "string" for minimal runtimes.
+        This parameter has no effect when `num_consistency_samples = 0`.
+
+        num_candidate_responses (int, default = 1): how many alternative candidate responses are internally generated in `TLM.prompt()`.
+        `TLM.prompt()` scores the trustworthiness of each candidate response, and then returns the most trustworthy one.
+        You can auto-improve responses by increasing this parameter, but at higher runtimes/costs.
+        This parameter must be between 1 and 20. It has no effect on `TLM.score()`.
+        When this parameter is 1, `TLM.prompt()` simply returns a standard LLM response and does not attempt to auto-improve it.
+        This parameter has no effect when `disable_trustworthiness` is True.
+
+        disable_trustworthiness (bool, default = False): if True, TLM will not compute trust scores,
+        useful if you only want to compute custom evaluation criteria.
+    """
+
     custom_eval_criteria: Iterable[object]

     disable_persistence: bool

diff --git a/src/codex/types/project_validate_response.py b/src/codex/types/project_validate_response.py
index b9166c2..895db6f 100644
--- a/src/codex/types/project_validate_response.py
+++ b/src/codex/types/project_validate_response.py
@@ -74,6 +74,10 @@ class EvalScores(BaseModel):
 class GuardrailedFallback(BaseModel):
+    """
+    Name, fallback message, fallback priority, and fallback type of the triggered guardrail with the highest fallback priority
+    """
+
     message: str
     """
     Fallback message to use if this eval fails and causes the response to be

diff --git a/src/codex/types/projects/eval_create_params.py b/src/codex/types/projects/eval_create_params.py
index d4ec41e..d319f92 100644
--- a/src/codex/types/projects/eval_create_params.py
+++ b/src/codex/types/projects/eval_create_params.py
@@ -73,6 +73,8 @@ class EvalCreateParams(TypedDict, total=False):
 class GuardrailedFallback(TypedDict, total=False):
+    """message, priority, type"""
+
     message: Required[str]
     """
     Fallback message to use if this eval fails and causes the response to be

diff --git a/src/codex/types/projects/eval_list_response.py b/src/codex/types/projects/eval_list_response.py
index 2aa0d75..47bdd3d 100644
--- a/src/codex/types/projects/eval_list_response.py
+++ b/src/codex/types/projects/eval_list_response.py
@@ -9,6 +9,8 @@
 class EvalGuardrailedFallback(BaseModel):
+    """message, priority, type"""
+
     message: str
     """
     Fallback message to use if this eval fails and causes the response to be
@@ -96,6 +98,8 @@ class Eval(BaseModel):
 class EvalListResponse(BaseModel):
+    """Schema for paginated evals response."""
+
     evals: List[Eval]

     total_count: int

diff --git a/src/codex/types/projects/eval_update_params.py b/src/codex/types/projects/eval_update_params.py
index 7da4e1e..87dc940 100644
--- a/src/codex/types/projects/eval_update_params.py
+++ b/src/codex/types/projects/eval_update_params.py
@@ -83,6 +85,8 @@ class CustomEvalCreateOrUpdateSchema(TypedDict, total=False):
 class CustomEvalCreateOrUpdateSchemaGuardrailedFallback(TypedDict, total=False):
+    """message, priority, type"""
+
     message: Required[str]
     """
     Fallback message to use if this eval fails and causes the response to be
@@ -137,6 +139,8 @@ class DefaultEvalUpdateSchema(TypedDict, total=False):
 class DefaultEvalUpdateSchemaGuardrailedFallback(TypedDict, total=False):
+    """message, priority, type"""
+
     message: Required[str]
     """
     Fallback message to use if this eval fails and causes the response to be

diff --git a/src/codex/types/projects/query_log_list_by_group_response.py b/src/codex/types/projects/query_log_list_by_group_response.py
index 1a89baa..4638a3f 100644
--- a/src/codex/types/projects/query_log_list_by_group_response.py
+++ b/src/codex/types/projects/query_log_list_by_group_response.py
@@ -84,6 +84,12 @@ class QueryLogsByGroupQueryLogFormattedNonGuardrailEvalScores(BaseModel):
 class QueryLogsByGroupQueryLogContext(BaseModel):
+    """Represents a document in RAG context.
+
+    This schema is designed to be flexible while maintaining structure for RAG systems.
+    It supports both simple string content and rich document metadata.
+    """
+
     content: str
     """The actual content/text of the document."""

@@ -142,6 +148,10 @@ class QueryLogsByGroupQueryLogEvaluatedResponseToolCall(BaseModel):
 class QueryLogsByGroupQueryLogGuardrailedFallback(BaseModel):
+    """
+    Name, fallback message, priority, and type for the triggered guardrail with the highest priority
+    """
+
     message: str
     """
     Fallback message to use if this eval fails and causes the response to be
@@ -467,12 +477,6 @@ class QueryLogsByGroupQueryLog(BaseModel):
     expert review. Expert review will override the original guardrail decision.
     """

-    expert_override_log_id: Optional[str] = None
-    """
-    ID of the query log with expert review that overrode the original guardrail
-    decision.
-    """
-
     expert_review_created_at: Optional[datetime] = None
     """When the expert review was created"""
@@ -529,6 +533,15 @@ class QueryLogsByGroupQueryLog(BaseModel):
     primary_eval_issue_score: Optional[float] = None
     """Score of the primary eval issue"""

+    system_prompt: Optional[str] = None
+    """
+    Content of the first system message associated with this query log, if
+    available.
+    """
+
+    system_prompt_hash: Optional[str] = None
+    """SHA-256 hash of the system prompt content for quick equality checks."""
+
     tools: Optional[List[QueryLogsByGroupQueryLogTool]] = None
     """Tools to use for the LLM call.

@@ -543,6 +556,8 @@ class QueryLogsByGroup(BaseModel):
 class Filters(BaseModel):
+    """Applied filters for the query"""
+
     custom_metadata_dict: Optional[object] = None

     created_at_end: Optional[datetime] = None

diff --git a/src/codex/types/projects/query_log_list_groups_response.py b/src/codex/types/projects/query_log_list_groups_response.py
index fe70223..c5c2a4d 100644
--- a/src/codex/types/projects/query_log_list_groups_response.py
+++ b/src/codex/types/projects/query_log_list_groups_response.py
@@ -81,6 +81,12 @@ class FormattedNonGuardrailEvalScores(BaseModel):
 class Context(BaseModel):
+    """Represents a document in RAG context.
+
+    This schema is designed to be flexible while maintaining structure for RAG systems.
+    It supports both simple string content and rich document metadata.
+    """
+
     content: str
     """The actual content/text of the document."""

@@ -139,6 +145,10 @@ class EvaluatedResponseToolCall(BaseModel):
 class GuardrailedFallback(BaseModel):
+    """
+    Name, fallback message, priority, and type for the triggered guardrail with the highest priority
+    """
+
     message: str
     """
     Fallback message to use if this eval fails and causes the response to be
@@ -462,12 +472,6 @@ class QueryLogListGroupsResponse(BaseModel):
     expert review. Expert review will override the original guardrail decision.
     """

-    expert_override_log_id: Optional[str] = None
-    """
-    ID of the query log with expert review that overrode the original guardrail
-    decision.
-    """
-
     expert_review_created_at: Optional[datetime] = None
     """When the expert review was created"""
@@ -524,6 +528,15 @@ class QueryLogListGroupsResponse(BaseModel):
     primary_eval_issue_score: Optional[float] = None
     """Score of the primary eval issue"""

+    system_prompt: Optional[str] = None
+    """
+    Content of the first system message associated with this query log, if
+    available.
+    """
+
+    system_prompt_hash: Optional[str] = None
+    """SHA-256 hash of the system prompt content for quick equality checks."""
+
     tools: Optional[List[Tool]] = None
     """Tools to use for the LLM call.
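For orientation, the `Options` TypedDict documented in `project_validate_params.py` above is supplied as a plain dict at call sites. Below is a minimal sketch using only keys named in that docstring; the example criterion is illustrative, and how the dict is passed to the validate endpoint is outside this diff.

```python
# Sketch of a TLMOptions-style payload. Every key below comes from the
# docstring added in project_validate_params.py; the "conciseness" criterion
# is a made-up illustration, not part of this diff.
options = {
    "model": "gpt-4.1-mini",         # default base model per the docstring
    "reasoning_effort": "low",       # fewer thinking tokens, faster runs
    "num_consistency_samples": 4,    # the "high" preset value; range is 0-20
    "num_self_reflections": 3,       # maximum currently supported
    "similarity_measure": "string",  # cheapest consistency measure
    "log": ["explanation"],          # ask TLM to return trust-score explanations
    "custom_eval_criteria": [
        {"name": "conciseness", "criteria": "Rate whether the response is concise."}
    ],
}
```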
diff --git a/src/codex/types/projects/query_log_list_response.py b/src/codex/types/projects/query_log_list_response.py
index dc7768f..b558081 100644
--- a/src/codex/types/projects/query_log_list_response.py
+++ b/src/codex/types/projects/query_log_list_response.py
@@ -81,6 +81,12 @@ class FormattedNonGuardrailEvalScores(BaseModel):
 class Context(BaseModel):
+    """Represents a document in RAG context.
+
+    This schema is designed to be flexible while maintaining structure for RAG systems.
+    It supports both simple string content and rich document metadata.
+    """
+
     content: str
     """The actual content/text of the document."""

@@ -139,6 +145,10 @@ class EvaluatedResponseToolCall(BaseModel):
 class GuardrailedFallback(BaseModel):
+    """
+    Name, fallback message, priority, and type for the triggered guardrail with the highest priority
+    """
+
     message: str
     """
     Fallback message to use if this eval fails and causes the response to be
@@ -450,12 +460,6 @@ class QueryLogListResponse(BaseModel):
     expert review. Expert review will override the original guardrail decision.
     """

-    expert_override_log_id: Optional[str] = None
-    """
-    ID of the query log with expert review that overrode the original guardrail
-    decision.
-    """
-
     expert_review_created_at: Optional[datetime] = None
     """When the expert review was created"""
@@ -509,6 +513,15 @@ class QueryLogListResponse(BaseModel):
     primary_eval_issue_score: Optional[float] = None
     """Score of the primary eval issue"""

+    system_prompt: Optional[str] = None
+    """
+    Content of the first system message associated with this query log, if
+    available.
+    """
+
+    system_prompt_hash: Optional[str] = None
+    """SHA-256 hash of the system prompt content for quick equality checks."""
+
     tools: Optional[List[Tool]] = None
     """Tools to use for the LLM call.

diff --git a/src/codex/types/projects/query_log_retrieve_response.py b/src/codex/types/projects/query_log_retrieve_response.py
index db91943..5df2108 100644
--- a/src/codex/types/projects/query_log_retrieve_response.py
+++ b/src/codex/types/projects/query_log_retrieve_response.py
@@ -81,6 +81,12 @@ class FormattedNonGuardrailEvalScores(BaseModel):
 class Context(BaseModel):
+    """Represents a document in RAG context.
+
+    This schema is designed to be flexible while maintaining structure for RAG systems.
+    It supports both simple string content and rich document metadata.
+    """
+
     content: str
     """The actual content/text of the document."""

@@ -139,6 +145,10 @@ class EvaluatedResponseToolCall(BaseModel):
 class GuardrailedFallback(BaseModel):
+    """
+    Name, fallback message, priority, and type for the triggered guardrail with the highest priority
+    """
+
     message: str
     """
     Fallback message to use if this eval fails and causes the response to be
@@ -367,6 +377,8 @@ class QueryLogRetrieveResponse(BaseModel):
     expert_answer_id: Optional[str] = None

+    expert_override_log_id: Optional[str] = None
+
     formatted_escalation_eval_scores: Optional[Dict[str, FormattedEscalationEvalScores]] = None

     formatted_eval_scores: Optional[Dict[str, FormattedEvalScores]] = None
@@ -392,6 +404,8 @@ class QueryLogRetrieveResponse(BaseModel):
     issue_status: Literal["addressed", "unaddressed"]
     """Manual review status override for remediations."""

+    log_needs_review: bool
+
     needs_review: bool

     project_id: str
@@ -457,12 +471,6 @@ class QueryLogRetrieveResponse(BaseModel):
     expert review. Expert review will override the original guardrail decision.
""" - expert_override_log_id: Optional[str] = None - """ - ID of the query log with expert review that overrode the original guardrail - decision. - """ - expert_review_created_at: Optional[datetime] = None """When the expert review was created""" @@ -519,6 +527,15 @@ class QueryLogRetrieveResponse(BaseModel): primary_eval_issue_score: Optional[float] = None """Score of the primary eval issue""" + system_prompt: Optional[str] = None + """ + Content of the first system message associated with this query log, if + available. + """ + + system_prompt_hash: Optional[str] = None + """SHA-256 hash of the system prompt content for quick equality checks.""" + tools: Optional[List[Tool]] = None """Tools to use for the LLM call. diff --git a/src/codex/types/projects/remediation_list_resolved_logs_response.py b/src/codex/types/projects/remediation_list_resolved_logs_response.py index 9f1b77b..d96d129 100644 --- a/src/codex/types/projects/remediation_list_resolved_logs_response.py +++ b/src/codex/types/projects/remediation_list_resolved_logs_response.py @@ -82,6 +82,12 @@ class QueryLogFormattedNonGuardrailEvalScores(BaseModel): class QueryLogContext(BaseModel): + """Represents a document in RAG contex. + + This schema is designed to be flexible while maintaining structure for RAG systems. + It supports both simple string content and rich document metadata. + """ + content: str """The actual content/text of the document.""" @@ -140,6 +146,10 @@ class QueryLogEvaluatedResponseToolCall(BaseModel): class QueryLogGuardrailedFallback(BaseModel): + """ + Name, fallback message, priority, and type for for the triggered guardrail with the highest priority + """ + message: str """ Fallback message to use if this eval fails and causes the response to be @@ -457,12 +467,6 @@ class QueryLog(BaseModel): expert review. Expert review will override the original guardrail decision. """ - expert_override_log_id: Optional[str] = None - """ - ID of the query log with expert review that overrode the original guardrail - decision. - """ - expert_review_created_at: Optional[datetime] = None """When the expert review was created""" @@ -516,6 +520,15 @@ class QueryLog(BaseModel): primary_eval_issue_score: Optional[float] = None """Score of the primary eval issue""" + system_prompt: Optional[str] = None + """ + Content of the first system message associated with this query log, if + available. + """ + + system_prompt_hash: Optional[str] = None + """SHA-256 hash of the system prompt content for quick equality checks.""" + tools: Optional[List[QueryLogTool]] = None """Tools to use for the LLM call. 
diff --git a/src/codex/types/projects/remediations/expert_review_list_response.py b/src/codex/types/projects/remediations/expert_review_list_response.py
index 99d26ab..eadb974 100644
--- a/src/codex/types/projects/remediations/expert_review_list_response.py
+++ b/src/codex/types/projects/remediations/expert_review_list_response.py
@@ -16,6 +16,8 @@ class ExpertReviewListResponse(BaseModel):
     evaluated_response: Optional[str] = None

+    expert_override_log_id: str
+
     last_edited_at: datetime

     last_edited_by: Optional[str] = None

diff --git a/src/codex/types/projects/remediations/expert_review_retrieve_response.py b/src/codex/types/projects/remediations/expert_review_retrieve_response.py
index 9cb0da6..f48fac2 100644
--- a/src/codex/types/projects/remediations/expert_review_retrieve_response.py
+++ b/src/codex/types/projects/remediations/expert_review_retrieve_response.py
@@ -16,6 +16,8 @@ class ExpertReviewRetrieveResponse(BaseModel):
     evaluated_response: Optional[str] = None

+    expert_override_log_id: str
+
     last_edited_at: datetime

     last_edited_by: Optional[str] = None
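These last two hunks make `expert_override_log_id` a required field on the expert-review responses, while the per-log list models above drop their optional copy and `QueryLogRetrieveResponse` keeps one. A sketch of reading the field follows; the client import and resource path are inferred from the module layout and are assumptions, as are all identifiers and argument names.

```python
# Assumed SDK surface: the resource path mirrors the module location
# src/codex/types/projects/remediations/expert_review_retrieve_response.py.
# The retrieve() signature and all IDs here are hypothetical.
from codex import Codex

client = Codex()

review = client.projects.remediations.expert_reviews.retrieve(
    "expert_review_id",
    project_id="project_id",
    remediation_id="remediation_id",
)

# Per the docstring this diff removes from the per-log models, the field
# links the review to the query log whose guardrail decision was overridden.
print(review.expert_override_log_id)
```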