diff --git a/.release-please-manifest.json b/.release-please-manifest.json index 36b2aff..f996350 100644 --- a/.release-please-manifest.json +++ b/.release-please-manifest.json @@ -1,3 +1,3 @@ { - ".": "0.1.0-alpha.34" + ".": "0.1.0-alpha.35" } \ No newline at end of file diff --git a/.stats.yml b/.stats.yml index c021c17..7a6c349 100644 --- a/.stats.yml +++ b/.stats.yml @@ -1,3 +1,3 @@ configured_endpoints: 70 -openapi_spec_hash: 9018ebfb2a9e1afa87058b3a4bd41b0b +openapi_spec_hash: 11279400677011ad5dc1ebba33216ae4 config_hash: aad16f20fed13ac50211fc1d0e2ea621 diff --git a/CHANGELOG.md b/CHANGELOG.md index 7782cb1..95cf32e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,32 @@ # Changelog +## 0.1.0-alpha.35 (2025-12-17) + +Full Changelog: [v0.1.0-alpha.34...v0.1.0-alpha.35](https://github.com/cleanlab/codex-python/compare/v0.1.0-alpha.34...v0.1.0-alpha.35) + +### Features + +* **api:** api update ([7f283d7](https://github.com/cleanlab/codex-python/commit/7f283d7abb4b9b79de86c88745fb66ea9943cdae)) +* **api:** api update ([7742c60](https://github.com/cleanlab/codex-python/commit/7742c60ecad518656a184513c5228a3447aa34c9)) +* **api:** api update ([94bacaf](https://github.com/cleanlab/codex-python/commit/94bacaf492809bc9bc15175d272de53ad2569895)) +* **api:** api update ([884de94](https://github.com/cleanlab/codex-python/commit/884de944e616b26580830817486bb85e74f1e7c4)) + + +### Bug Fixes + +* ensure streams are always closed ([2c971c4](https://github.com/cleanlab/codex-python/commit/2c971c4a93b0e407737648e83e555dc6c9b3a759)) +* **types:** allow pyright to infer TypedDict types within SequenceNotStr ([d64e474](https://github.com/cleanlab/codex-python/commit/d64e47443ef147240de6cba892e901dcab0b2d71)) + + +### Chores + +* add missing docstrings ([250433e](https://github.com/cleanlab/codex-python/commit/250433e37cb8ba034de2977ee6375f06390cc6c4)) +* add Python 3.14 classifier and testing ([4dec29c](https://github.com/cleanlab/codex-python/commit/4dec29cdf74dd3beeccf326678db7170156f0c44)) +* **deps:** mypy 1.18.1 has a regression, pin to 1.17 ([1828526](https://github.com/cleanlab/codex-python/commit/18285268b4eec848b2be2df65cdbdf960424f72d)) +* **internal:** add missing files argument to base client ([c8986ce](https://github.com/cleanlab/codex-python/commit/c8986ce9fa0eae5726ba6cb6692dfa11c60284f5)) +* speedup initial import ([9f17615](https://github.com/cleanlab/codex-python/commit/9f17615353be5ee705ea2f4713d9dc790b2ecb3b)) +* update lockfile ([230659a](https://github.com/cleanlab/codex-python/commit/230659a94b4921805c84224578df1324829e5d07)) + ## 0.1.0-alpha.34 (2025-11-19) Full Changelog: [v0.1.0-alpha.33...v0.1.0-alpha.34](https://github.com/cleanlab/codex-python/compare/v0.1.0-alpha.33...v0.1.0-alpha.34) diff --git a/pyproject.toml b/pyproject.toml index 5fb2418..d65296d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,20 +1,22 @@ [project] name = "codex-sdk" -version = "0.1.0-alpha.34" +version = "0.1.0-alpha.35" description = "Internal SDK used within cleanlab-codex package. Refer to https://pypi.org/project/cleanlab-codex/ instead." 
dynamic = ["readme"] license = "MIT" authors = [ { name = "Cleanlab", email = "team@cleanlab.ai" }, ] + dependencies = [ - "httpx>=0.23.0, <1", - "pydantic>=1.9.0, <3", - "typing-extensions>=4.10, <5", - "anyio>=3.5.0, <5", - "distro>=1.7.0, <2", - "sniffio", + "httpx>=0.23.0, <1", + "pydantic>=1.9.0, <3", + "typing-extensions>=4.10, <5", + "anyio>=3.5.0, <5", + "distro>=1.7.0, <2", + "sniffio", ] + requires-python = ">= 3.9" classifiers = [ "Typing :: Typed", @@ -24,6 +26,7 @@ classifiers = [ "Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3.12", "Programming Language :: Python :: 3.13", + "Programming Language :: Python :: 3.14", "Operating System :: OS Independent", "Operating System :: POSIX", "Operating System :: MacOS", @@ -45,7 +48,7 @@ managed = true # version pins are in requirements-dev.lock dev-dependencies = [ "pyright==1.1.399", - "mypy", + "mypy==1.17", "respx", "pytest", "pytest-asyncio", diff --git a/requirements-dev.lock b/requirements-dev.lock index d728372..90dc04b 100644 --- a/requirements-dev.lock +++ b/requirements-dev.lock @@ -12,40 +12,45 @@ -e file:. aiohappyeyeballs==2.6.1 # via aiohttp -aiohttp==3.12.8 +aiohttp==3.13.2 # via codex-sdk # via httpx-aiohttp -aiosignal==1.3.2 +aiosignal==1.4.0 # via aiohttp -annotated-types==0.6.0 +annotated-types==0.7.0 # via pydantic -anyio==4.4.0 +anyio==4.12.0 # via codex-sdk # via httpx -argcomplete==3.1.2 +argcomplete==3.6.3 # via nox async-timeout==5.0.1 # via aiohttp -attrs==25.3.0 +attrs==25.4.0 # via aiohttp -certifi==2023.7.22 + # via nox +backports-asyncio-runner==1.2.0 + # via pytest-asyncio +certifi==2025.11.12 # via httpcore # via httpx -colorlog==6.7.0 +colorlog==6.10.1 + # via nox +dependency-groups==1.3.1 # via nox -dirty-equals==0.6.0 -distlib==0.3.7 +dirty-equals==0.11 +distlib==0.4.0 # via virtualenv -distro==1.8.0 +distro==1.9.0 # via codex-sdk -exceptiongroup==1.2.2 +exceptiongroup==1.3.1 # via anyio # via pytest -execnet==2.1.1 +execnet==2.1.2 # via pytest-xdist -filelock==3.12.4 +filelock==3.19.1 # via virtualenv -frozenlist==1.6.2 +frozenlist==1.8.0 # via aiohttp # via aiosignal h11==0.16.0 @@ -58,80 +63,87 @@ httpx==0.28.1 # via respx httpx-aiohttp==0.1.9 # via codex-sdk -idna==3.4 +humanize==4.13.0 + # via nox +idna==3.11 # via anyio # via httpx # via yarl -importlib-metadata==7.0.0 -iniconfig==2.0.0 +importlib-metadata==8.7.0 +iniconfig==2.1.0 # via pytest markdown-it-py==3.0.0 # via rich mdurl==0.1.2 # via markdown-it-py -multidict==6.4.4 +multidict==6.7.0 # via aiohttp # via yarl -mypy==1.14.1 -mypy-extensions==1.0.0 +mypy==1.17.0 +mypy-extensions==1.1.0 # via mypy -nodeenv==1.8.0 +nodeenv==1.9.1 # via pyright -nox==2023.4.22 -packaging==23.2 +nox==2025.11.12 +packaging==25.0 + # via dependency-groups # via nox # via pytest -platformdirs==3.11.0 +pathspec==0.12.1 + # via mypy +platformdirs==4.4.0 # via virtualenv -pluggy==1.5.0 +pluggy==1.6.0 # via pytest -propcache==0.3.1 +propcache==0.4.1 # via aiohttp # via yarl -pydantic==2.11.9 +pydantic==2.12.5 # via codex-sdk -pydantic-core==2.33.2 +pydantic-core==2.41.5 # via pydantic -pygments==2.18.0 +pygments==2.19.2 + # via pytest # via rich pyright==1.1.399 -pytest==8.3.3 +pytest==8.4.2 # via pytest-asyncio # via pytest-xdist -pytest-asyncio==0.24.0 -pytest-xdist==3.7.0 -python-dateutil==2.8.2 +pytest-asyncio==1.2.0 +pytest-xdist==3.8.0 +python-dateutil==2.9.0.post0 # via time-machine -pytz==2023.3.post1 - # via dirty-equals respx==0.22.0 -rich==13.7.1 -ruff==0.9.4 -setuptools==68.2.2 - # via nodeenv -six==1.16.0 +rich==14.2.0 
+ruff==0.14.7 +six==1.17.0 # via python-dateutil -sniffio==1.3.0 - # via anyio +sniffio==1.3.1 # via codex-sdk -time-machine==2.9.0 -tomli==2.0.2 +time-machine==2.19.0 +tomli==2.3.0 + # via dependency-groups # via mypy + # via nox # via pytest -typing-extensions==4.12.2 +typing-extensions==4.15.0 + # via aiosignal # via anyio # via codex-sdk + # via exceptiongroup # via multidict # via mypy # via pydantic # via pydantic-core # via pyright + # via pytest-asyncio # via typing-inspection -typing-inspection==0.4.1 + # via virtualenv +typing-inspection==0.4.2 # via pydantic -virtualenv==20.24.5 +virtualenv==20.35.4 # via nox -yarl==1.20.0 +yarl==1.22.0 # via aiohttp -zipp==3.17.0 +zipp==3.23.0 # via importlib-metadata diff --git a/requirements.lock b/requirements.lock index 4b916da..2cedc49 100644 --- a/requirements.lock +++ b/requirements.lock @@ -12,28 +12,28 @@ -e file:. aiohappyeyeballs==2.6.1 # via aiohttp -aiohttp==3.12.8 +aiohttp==3.13.2 # via codex-sdk # via httpx-aiohttp -aiosignal==1.3.2 +aiosignal==1.4.0 # via aiohttp -annotated-types==0.6.0 +annotated-types==0.7.0 # via pydantic -anyio==4.4.0 +anyio==4.12.0 # via codex-sdk # via httpx async-timeout==5.0.1 # via aiohttp -attrs==25.3.0 +attrs==25.4.0 # via aiohttp -certifi==2023.7.22 +certifi==2025.11.12 # via httpcore # via httpx -distro==1.8.0 +distro==1.9.0 # via codex-sdk -exceptiongroup==1.2.2 +exceptiongroup==1.3.1 # via anyio -frozenlist==1.6.2 +frozenlist==1.8.0 # via aiohttp # via aiosignal h11==0.16.0 @@ -45,31 +45,32 @@ httpx==0.28.1 # via httpx-aiohttp httpx-aiohttp==0.1.9 # via codex-sdk -idna==3.4 +idna==3.11 # via anyio # via httpx # via yarl -multidict==6.4.4 +multidict==6.7.0 # via aiohttp # via yarl -propcache==0.3.1 +propcache==0.4.1 # via aiohttp # via yarl -pydantic==2.11.9 +pydantic==2.12.5 # via codex-sdk -pydantic-core==2.33.2 +pydantic-core==2.41.5 # via pydantic -sniffio==1.3.0 - # via anyio +sniffio==1.3.1 # via codex-sdk -typing-extensions==4.12.2 +typing-extensions==4.15.0 + # via aiosignal # via anyio # via codex-sdk + # via exceptiongroup # via multidict # via pydantic # via pydantic-core # via typing-inspection -typing-inspection==0.4.1 +typing-inspection==0.4.2 # via pydantic -yarl==1.20.0 +yarl==1.22.0 # via aiohttp diff --git a/src/codex/_base_client.py b/src/codex/_base_client.py index e6febf3..1ce4a39 100644 --- a/src/codex/_base_client.py +++ b/src/codex/_base_client.py @@ -1247,9 +1247,12 @@ def patch( *, cast_to: Type[ResponseT], body: Body | None = None, + files: RequestFiles | None = None, options: RequestOptions = {}, ) -> ResponseT: - opts = FinalRequestOptions.construct(method="patch", url=path, json_data=body, **options) + opts = FinalRequestOptions.construct( + method="patch", url=path, json_data=body, files=to_httpx_files(files), **options + ) return self.request(cast_to, opts) def put( @@ -1767,9 +1770,12 @@ async def patch( *, cast_to: Type[ResponseT], body: Body | None = None, + files: RequestFiles | None = None, options: RequestOptions = {}, ) -> ResponseT: - opts = FinalRequestOptions.construct(method="patch", url=path, json_data=body, **options) + opts = FinalRequestOptions.construct( + method="patch", url=path, json_data=body, files=to_httpx_files(files), **options + ) return await self.request(cast_to, opts) async def put( diff --git a/src/codex/_client.py b/src/codex/_client.py index 308ce9a..1bfcb7e 100644 --- a/src/codex/_client.py +++ b/src/codex/_client.py @@ -3,7 +3,7 @@ from __future__ import annotations import os -from typing import Any, Dict, Mapping, cast +from typing 
import TYPE_CHECKING, Any, Dict, Mapping, cast from typing_extensions import Self, Literal, override import httpx @@ -21,8 +21,8 @@ not_given, ) from ._utils import is_given, get_async_library +from ._compat import cached_property from ._version import __version__ -from .resources import health from ._streaming import Stream as Stream, AsyncStream as AsyncStream from ._exceptions import APIStatusError from ._base_client import ( @@ -30,9 +30,13 @@ SyncAPIClient, AsyncAPIClient, ) -from .resources.users import users -from .resources.projects import projects -from .resources.organizations import organizations + +if TYPE_CHECKING: + from .resources import users, health, projects, organizations + from .resources.health import HealthResource, AsyncHealthResource + from .resources.users.users import UsersResource, AsyncUsersResource + from .resources.projects.projects import ProjectsResource, AsyncProjectsResource + from .resources.organizations.organizations import OrganizationsResource, AsyncOrganizationsResource __all__ = [ "ENVIRONMENTS", @@ -54,13 +58,6 @@ class Codex(SyncAPIClient): - health: health.HealthResource - organizations: organizations.OrganizationsResource - users: users.UsersResource - projects: projects.ProjectsResource - with_raw_response: CodexWithRawResponse - with_streaming_response: CodexWithStreamedResponse - # client options auth_token: str | None api_key: str | None @@ -138,12 +135,37 @@ def __init__( _strict_response_validation=_strict_response_validation, ) - self.health = health.HealthResource(self) - self.organizations = organizations.OrganizationsResource(self) - self.users = users.UsersResource(self) - self.projects = projects.ProjectsResource(self) - self.with_raw_response = CodexWithRawResponse(self) - self.with_streaming_response = CodexWithStreamedResponse(self) + @cached_property + def health(self) -> HealthResource: + from .resources.health import HealthResource + + return HealthResource(self) + + @cached_property + def organizations(self) -> OrganizationsResource: + from .resources.organizations import OrganizationsResource + + return OrganizationsResource(self) + + @cached_property + def users(self) -> UsersResource: + from .resources.users import UsersResource + + return UsersResource(self) + + @cached_property + def projects(self) -> ProjectsResource: + from .resources.projects import ProjectsResource + + return ProjectsResource(self) + + @cached_property + def with_raw_response(self) -> CodexWithRawResponse: + return CodexWithRawResponse(self) + + @cached_property + def with_streaming_response(self) -> CodexWithStreamedResponse: + return CodexWithStreamedResponse(self) @property @override @@ -298,13 +320,6 @@ def _make_status_error( class AsyncCodex(AsyncAPIClient): - health: health.AsyncHealthResource - organizations: organizations.AsyncOrganizationsResource - users: users.AsyncUsersResource - projects: projects.AsyncProjectsResource - with_raw_response: AsyncCodexWithRawResponse - with_streaming_response: AsyncCodexWithStreamedResponse - # client options auth_token: str | None api_key: str | None @@ -382,12 +397,37 @@ def __init__( _strict_response_validation=_strict_response_validation, ) - self.health = health.AsyncHealthResource(self) - self.organizations = organizations.AsyncOrganizationsResource(self) - self.users = users.AsyncUsersResource(self) - self.projects = projects.AsyncProjectsResource(self) - self.with_raw_response = AsyncCodexWithRawResponse(self) - self.with_streaming_response = AsyncCodexWithStreamedResponse(self) + @cached_property 
+ def health(self) -> AsyncHealthResource: + from .resources.health import AsyncHealthResource + + return AsyncHealthResource(self) + + @cached_property + def organizations(self) -> AsyncOrganizationsResource: + from .resources.organizations import AsyncOrganizationsResource + + return AsyncOrganizationsResource(self) + + @cached_property + def users(self) -> AsyncUsersResource: + from .resources.users import AsyncUsersResource + + return AsyncUsersResource(self) + + @cached_property + def projects(self) -> AsyncProjectsResource: + from .resources.projects import AsyncProjectsResource + + return AsyncProjectsResource(self) + + @cached_property + def with_raw_response(self) -> AsyncCodexWithRawResponse: + return AsyncCodexWithRawResponse(self) + + @cached_property + def with_streaming_response(self) -> AsyncCodexWithStreamedResponse: + return AsyncCodexWithStreamedResponse(self) @property @override @@ -542,35 +582,127 @@ def _make_status_error( class CodexWithRawResponse: + _client: Codex + def __init__(self, client: Codex) -> None: - self.health = health.HealthResourceWithRawResponse(client.health) - self.organizations = organizations.OrganizationsResourceWithRawResponse(client.organizations) - self.users = users.UsersResourceWithRawResponse(client.users) - self.projects = projects.ProjectsResourceWithRawResponse(client.projects) + self._client = client + + @cached_property + def health(self) -> health.HealthResourceWithRawResponse: + from .resources.health import HealthResourceWithRawResponse + + return HealthResourceWithRawResponse(self._client.health) + + @cached_property + def organizations(self) -> organizations.OrganizationsResourceWithRawResponse: + from .resources.organizations import OrganizationsResourceWithRawResponse + + return OrganizationsResourceWithRawResponse(self._client.organizations) + + @cached_property + def users(self) -> users.UsersResourceWithRawResponse: + from .resources.users import UsersResourceWithRawResponse + + return UsersResourceWithRawResponse(self._client.users) + + @cached_property + def projects(self) -> projects.ProjectsResourceWithRawResponse: + from .resources.projects import ProjectsResourceWithRawResponse + + return ProjectsResourceWithRawResponse(self._client.projects) class AsyncCodexWithRawResponse: + _client: AsyncCodex + def __init__(self, client: AsyncCodex) -> None: - self.health = health.AsyncHealthResourceWithRawResponse(client.health) - self.organizations = organizations.AsyncOrganizationsResourceWithRawResponse(client.organizations) - self.users = users.AsyncUsersResourceWithRawResponse(client.users) - self.projects = projects.AsyncProjectsResourceWithRawResponse(client.projects) + self._client = client + + @cached_property + def health(self) -> health.AsyncHealthResourceWithRawResponse: + from .resources.health import AsyncHealthResourceWithRawResponse + + return AsyncHealthResourceWithRawResponse(self._client.health) + + @cached_property + def organizations(self) -> organizations.AsyncOrganizationsResourceWithRawResponse: + from .resources.organizations import AsyncOrganizationsResourceWithRawResponse + + return AsyncOrganizationsResourceWithRawResponse(self._client.organizations) + + @cached_property + def users(self) -> users.AsyncUsersResourceWithRawResponse: + from .resources.users import AsyncUsersResourceWithRawResponse + + return AsyncUsersResourceWithRawResponse(self._client.users) + + @cached_property + def projects(self) -> projects.AsyncProjectsResourceWithRawResponse: + from .resources.projects import 
AsyncProjectsResourceWithRawResponse + + return AsyncProjectsResourceWithRawResponse(self._client.projects) class CodexWithStreamedResponse: + _client: Codex + def __init__(self, client: Codex) -> None: - self.health = health.HealthResourceWithStreamingResponse(client.health) - self.organizations = organizations.OrganizationsResourceWithStreamingResponse(client.organizations) - self.users = users.UsersResourceWithStreamingResponse(client.users) - self.projects = projects.ProjectsResourceWithStreamingResponse(client.projects) + self._client = client + + @cached_property + def health(self) -> health.HealthResourceWithStreamingResponse: + from .resources.health import HealthResourceWithStreamingResponse + + return HealthResourceWithStreamingResponse(self._client.health) + + @cached_property + def organizations(self) -> organizations.OrganizationsResourceWithStreamingResponse: + from .resources.organizations import OrganizationsResourceWithStreamingResponse + + return OrganizationsResourceWithStreamingResponse(self._client.organizations) + + @cached_property + def users(self) -> users.UsersResourceWithStreamingResponse: + from .resources.users import UsersResourceWithStreamingResponse + + return UsersResourceWithStreamingResponse(self._client.users) + + @cached_property + def projects(self) -> projects.ProjectsResourceWithStreamingResponse: + from .resources.projects import ProjectsResourceWithStreamingResponse + + return ProjectsResourceWithStreamingResponse(self._client.projects) class AsyncCodexWithStreamedResponse: + _client: AsyncCodex + def __init__(self, client: AsyncCodex) -> None: - self.health = health.AsyncHealthResourceWithStreamingResponse(client.health) - self.organizations = organizations.AsyncOrganizationsResourceWithStreamingResponse(client.organizations) - self.users = users.AsyncUsersResourceWithStreamingResponse(client.users) - self.projects = projects.AsyncProjectsResourceWithStreamingResponse(client.projects) + self._client = client + + @cached_property + def health(self) -> health.AsyncHealthResourceWithStreamingResponse: + from .resources.health import AsyncHealthResourceWithStreamingResponse + + return AsyncHealthResourceWithStreamingResponse(self._client.health) + + @cached_property + def organizations(self) -> organizations.AsyncOrganizationsResourceWithStreamingResponse: + from .resources.organizations import AsyncOrganizationsResourceWithStreamingResponse + + return AsyncOrganizationsResourceWithStreamingResponse(self._client.organizations) + + @cached_property + def users(self) -> users.AsyncUsersResourceWithStreamingResponse: + from .resources.users import AsyncUsersResourceWithStreamingResponse + + return AsyncUsersResourceWithStreamingResponse(self._client.users) + + @cached_property + def projects(self) -> projects.AsyncProjectsResourceWithStreamingResponse: + from .resources.projects import AsyncProjectsResourceWithStreamingResponse + + return AsyncProjectsResourceWithStreamingResponse(self._client.projects) Client = Codex diff --git a/src/codex/_streaming.py b/src/codex/_streaming.py index d9c4a80..e6c997e 100644 --- a/src/codex/_streaming.py +++ b/src/codex/_streaming.py @@ -54,11 +54,12 @@ def __stream__(self) -> Iterator[_T]: process_data = self._client._process_response_data iterator = self._iter_events() - for sse in iterator: - yield process_data(data=sse.json(), cast_to=cast_to, response=response) - - # As we might not fully consume the response stream, we need to close it explicitly - response.close() + try: + for sse in iterator: + yield 
process_data(data=sse.json(), cast_to=cast_to, response=response) + finally: + # Ensure the response is closed even if the consumer doesn't read all data + response.close() def __enter__(self) -> Self: return self @@ -117,11 +118,12 @@ async def __stream__(self) -> AsyncIterator[_T]: process_data = self._client._process_response_data iterator = self._iter_events() - async for sse in iterator: - yield process_data(data=sse.json(), cast_to=cast_to, response=response) - - # As we might not fully consume the response stream, we need to close it explicitly - await response.aclose() + try: + async for sse in iterator: + yield process_data(data=sse.json(), cast_to=cast_to, response=response) + finally: + # Ensure the response is closed even if the consumer doesn't read all data + await response.aclose() async def __aenter__(self) -> Self: return self diff --git a/src/codex/_types.py b/src/codex/_types.py index 2e4695f..edc28a8 100644 --- a/src/codex/_types.py +++ b/src/codex/_types.py @@ -243,6 +243,9 @@ class HttpxSendArgs(TypedDict, total=False): if TYPE_CHECKING: # This works because str.__contains__ does not accept object (either in typeshed or at runtime) # https://github.com/hauntsaninja/useful_types/blob/5e9710f3875107d068e7679fd7fec9cfab0eff3b/useful_types/__init__.py#L285 + # + # Note: index() and count() methods are intentionally omitted to allow pyright to properly + # infer TypedDict types when dict literals are used in lists assigned to SequenceNotStr. class SequenceNotStr(Protocol[_T_co]): @overload def __getitem__(self, index: SupportsIndex, /) -> _T_co: ... @@ -251,8 +254,6 @@ def __getitem__(self, index: slice, /) -> Sequence[_T_co]: ... def __contains__(self, value: object, /) -> bool: ... def __len__(self) -> int: ... def __iter__(self) -> Iterator[_T_co]: ... - def index(self, value: Any, start: int = 0, stop: int = ..., /) -> int: ... - def count(self, value: Any, /) -> int: ... def __reversed__(self) -> Iterator[_T_co]: ... else: # just point this to a normal `Sequence` at runtime to avoid having to special case diff --git a/src/codex/_version.py b/src/codex/_version.py index c2ea81e..0bddca9 100644 --- a/src/codex/_version.py +++ b/src/codex/_version.py @@ -1,4 +1,4 @@ # File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. 
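The `_client.py` rewrite above replaces eager resource construction in `__init__` with `cached_property` accessors whose imports are deferred into the accessor bodies; this lines up with the "speedup initial import" chore in the changelog, since `import codex` no longer pulls in every resource module up front. A minimal, self-contained sketch of the pattern using toy names (not the SDK's real modules):

```python
from functools import cached_property

class ProjectsResource:
    """Stand-in for a resource class defined in a heavy submodule."""

    def __init__(self, client: "Client") -> None:
        self._client = client

class Client:
    @cached_property
    def projects(self) -> ProjectsResource:
        # The SDK performs a deferred `from .resources.projects import
        # ProjectsResource` here, so the submodule is only imported on
        # first attribute access instead of at `import codex` time.
        return ProjectsResource(self)

client = Client()
assert client.projects is client.projects  # memoized after first access
```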
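The `_streaming.py` hunks above move `response.close()` / `await response.aclose()` into a `finally` block so the response is released even when the consumer stops iterating early or an exception interrupts the loop; the old code only closed after a fully drained loop. A standalone sketch of the same generator pattern, with a stub in place of `httpx.Response`:

```python
from typing import Iterator, List

class StubResponse:
    """Stand-in for httpx.Response; only close() matters for this sketch."""

    def __init__(self) -> None:
        self.closed = False

    def close(self) -> None:
        self.closed = True

def stream(response: StubResponse, events: List[str]) -> Iterator[str]:
    try:
        for event in events:
            yield event
    finally:
        # Runs on normal exhaustion *and* on GeneratorExit when the
        # consumer abandons the stream early -- the case the fix targets.
        response.close()

resp = StubResponse()
gen = stream(resp, ["a", "b", "c"])
next(gen)    # consume a single event
gen.close()  # abandon the stream: raises GeneratorExit inside the generator
assert resp.closed
```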
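On the `_types.py` hunk above: as the new comment explains, `index()` and `count()` were dropped from the `SequenceNotStr` protocol because their presence kept pyright from inferring TypedDict element types for dict literals in list arguments. A hedged illustration with a hypothetical `Message` TypedDict (the trimmed protocol omits the `__getitem__`/`__reversed__` members for brevity):

```python
from typing import Iterator, Protocol, TypeVar
from typing_extensions import TypedDict

_T_co = TypeVar("_T_co", covariant=True)

class SequenceNotStr(Protocol[_T_co]):
    # str.__contains__ does not accept `object`, so `str` fails to match
    # this protocol while list and tuple still do (same trick as the SDK).
    def __contains__(self, value: object, /) -> bool: ...
    def __len__(self) -> int: ...
    def __iter__(self) -> Iterator[_T_co]: ...

class Message(TypedDict):  # hypothetical payload type, not from the SDK
    role: str
    content: str

def send(messages: SequenceNotStr[Message]) -> None:
    for m in messages:
        print(m["role"], m["content"])

# With index()/count() gone, pyright checks this literal as list[Message]
# rather than widening the dicts and losing the TypedDict keys:
send([{"role": "user", "content": "hi"}])
```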
__title__ = "codex" -__version__ = "0.1.0-alpha.34" # x-release-please-version +__version__ = "0.1.0-alpha.35" # x-release-please-version diff --git a/src/codex/types/organization_list_members_response.py b/src/codex/types/organization_list_members_response.py index 1fa593e..f37c1f1 100644 --- a/src/codex/types/organization_list_members_response.py +++ b/src/codex/types/organization_list_members_response.py @@ -9,6 +9,8 @@ class OrganizationListMembersResponseItem(BaseModel): + """Schema for public organization member information.""" + email: str name: str diff --git a/src/codex/types/project_create_params.py b/src/codex/types/project_create_params.py index 4704f63..bd84b1c 100644 --- a/src/codex/types/project_create_params.py +++ b/src/codex/types/project_create_params.py @@ -39,6 +39,8 @@ class ProjectCreateParams(TypedDict, total=False): class ConfigEvalConfigCustomEvalsEvalsGuardrailedFallback(TypedDict, total=False): + """message, priority, type""" + message: Required[str] """ Fallback message to use if this eval fails and causes the response to be @@ -56,6 +58,11 @@ class ConfigEvalConfigCustomEvalsEvalsGuardrailedFallback(TypedDict, total=False class ConfigEvalConfigCustomEvalsEvals(TypedDict, total=False): + """A custom evaluation metric created by users. + + The TLMEvalSchema are mutable and stored in the database. + """ + criteria: Required[str] """ The evaluation criteria text that describes what aspect is being evaluated and @@ -120,10 +127,14 @@ class ConfigEvalConfigCustomEvalsEvals(TypedDict, total=False): class ConfigEvalConfigCustomEvals(TypedDict, total=False): + """Configuration for custom evaluation metrics.""" + evals: Dict[str, ConfigEvalConfigCustomEvalsEvals] class ConfigEvalConfigDefaultEvalsContextSufficiencyGuardrailedFallback(TypedDict, total=False): + """message, priority, type""" + message: Required[str] """ Fallback message to use if this eval fails and causes the response to be @@ -141,6 +152,12 @@ class ConfigEvalConfigDefaultEvalsContextSufficiencyGuardrailedFallback(TypedDic class ConfigEvalConfigDefaultEvalsContextSufficiency(TypedDict, total=False): + """A pre-configured evaluation metric from TrustworthyRAG or built into the system. + + The evaluation criteria and identifiers are immutable and system-managed, + while other properties like thresholds and priorities can be configured. + """ + eval_key: Required[str] """ Unique key for eval metric - currently maps to the TrustworthyRAG name property @@ -179,6 +196,8 @@ class ConfigEvalConfigDefaultEvalsContextSufficiency(TypedDict, total=False): class ConfigEvalConfigDefaultEvalsQueryEaseGuardrailedFallback(TypedDict, total=False): + """message, priority, type""" + message: Required[str] """ Fallback message to use if this eval fails and causes the response to be @@ -196,6 +215,12 @@ class ConfigEvalConfigDefaultEvalsQueryEaseGuardrailedFallback(TypedDict, total= class ConfigEvalConfigDefaultEvalsQueryEase(TypedDict, total=False): + """A pre-configured evaluation metric from TrustworthyRAG or built into the system. + + The evaluation criteria and identifiers are immutable and system-managed, + while other properties like thresholds and priorities can be configured. 
+ """ + eval_key: Required[str] """ Unique key for eval metric - currently maps to the TrustworthyRAG name property @@ -234,6 +259,8 @@ class ConfigEvalConfigDefaultEvalsQueryEase(TypedDict, total=False): class ConfigEvalConfigDefaultEvalsResponseGroundednessGuardrailedFallback(TypedDict, total=False): + """message, priority, type""" + message: Required[str] """ Fallback message to use if this eval fails and causes the response to be @@ -251,6 +278,12 @@ class ConfigEvalConfigDefaultEvalsResponseGroundednessGuardrailedFallback(TypedD class ConfigEvalConfigDefaultEvalsResponseGroundedness(TypedDict, total=False): + """A pre-configured evaluation metric from TrustworthyRAG or built into the system. + + The evaluation criteria and identifiers are immutable and system-managed, + while other properties like thresholds and priorities can be configured. + """ + eval_key: Required[str] """ Unique key for eval metric - currently maps to the TrustworthyRAG name property @@ -289,6 +322,8 @@ class ConfigEvalConfigDefaultEvalsResponseGroundedness(TypedDict, total=False): class ConfigEvalConfigDefaultEvalsResponseHelpfulnessGuardrailedFallback(TypedDict, total=False): + """message, priority, type""" + message: Required[str] """ Fallback message to use if this eval fails and causes the response to be @@ -306,6 +341,12 @@ class ConfigEvalConfigDefaultEvalsResponseHelpfulnessGuardrailedFallback(TypedDi class ConfigEvalConfigDefaultEvalsResponseHelpfulness(TypedDict, total=False): + """A pre-configured evaluation metric from TrustworthyRAG or built into the system. + + The evaluation criteria and identifiers are immutable and system-managed, + while other properties like thresholds and priorities can be configured. + """ + eval_key: Required[str] """ Unique key for eval metric - currently maps to the TrustworthyRAG name property @@ -344,6 +385,8 @@ class ConfigEvalConfigDefaultEvalsResponseHelpfulness(TypedDict, total=False): class ConfigEvalConfigDefaultEvalsTrustworthinessGuardrailedFallback(TypedDict, total=False): + """message, priority, type""" + message: Required[str] """ Fallback message to use if this eval fails and causes the response to be @@ -361,6 +404,12 @@ class ConfigEvalConfigDefaultEvalsTrustworthinessGuardrailedFallback(TypedDict, class ConfigEvalConfigDefaultEvalsTrustworthiness(TypedDict, total=False): + """A pre-configured evaluation metric from TrustworthyRAG or built into the system. + + The evaluation criteria and identifiers are immutable and system-managed, + while other properties like thresholds and priorities can be configured. + """ + eval_key: Required[str] """ Unique key for eval metric - currently maps to the TrustworthyRAG name property @@ -399,6 +448,8 @@ class ConfigEvalConfigDefaultEvalsTrustworthiness(TypedDict, total=False): class ConfigEvalConfigDefaultEvals(TypedDict, total=False): + """Configuration for default evaluation metrics.""" + context_sufficiency: ConfigEvalConfigDefaultEvalsContextSufficiency """A pre-configured evaluation metric from TrustworthyRAG or built into the system. 
@@ -436,6 +487,8 @@ class ConfigEvalConfigDefaultEvals(TypedDict, total=False): class ConfigEvalConfig(TypedDict, total=False): + """Configuration for project-specific evaluation metrics""" + custom_evals: ConfigEvalConfigCustomEvals """Configuration for custom evaluation metrics.""" diff --git a/src/codex/types/project_detect_params.py b/src/codex/types/project_detect_params.py index 8e93971..9cbf9bc 100644 --- a/src/codex/types/project_detect_params.py +++ b/src/codex/types/project_detect_params.py @@ -440,6 +440,8 @@ class ResponseChatCompletionTyped(TypedDict, total=False): class EvalConfigCustomEvalsEvalsGuardrailedFallback(TypedDict, total=False): + """message, priority, type""" + message: Required[str] """ Fallback message to use if this eval fails and causes the response to be @@ -457,6 +459,11 @@ class EvalConfigCustomEvalsEvalsGuardrailedFallback(TypedDict, total=False): class EvalConfigCustomEvalsEvals(TypedDict, total=False): + """A custom evaluation metric created by users. + + The TLMEvalSchema are mutable and stored in the database. + """ + criteria: Required[str] """ The evaluation criteria text that describes what aspect is being evaluated and @@ -521,10 +528,14 @@ class EvalConfigCustomEvalsEvals(TypedDict, total=False): class EvalConfigCustomEvals(TypedDict, total=False): + """Configuration for custom evaluation metrics.""" + evals: Dict[str, EvalConfigCustomEvalsEvals] class EvalConfigDefaultEvalsContextSufficiencyGuardrailedFallback(TypedDict, total=False): + """message, priority, type""" + message: Required[str] """ Fallback message to use if this eval fails and causes the response to be @@ -542,6 +553,12 @@ class EvalConfigDefaultEvalsContextSufficiencyGuardrailedFallback(TypedDict, tot class EvalConfigDefaultEvalsContextSufficiency(TypedDict, total=False): + """A pre-configured evaluation metric from TrustworthyRAG or built into the system. + + The evaluation criteria and identifiers are immutable and system-managed, + while other properties like thresholds and priorities can be configured. + """ + eval_key: Required[str] """ Unique key for eval metric - currently maps to the TrustworthyRAG name property @@ -580,6 +597,8 @@ class EvalConfigDefaultEvalsContextSufficiency(TypedDict, total=False): class EvalConfigDefaultEvalsQueryEaseGuardrailedFallback(TypedDict, total=False): + """message, priority, type""" + message: Required[str] """ Fallback message to use if this eval fails and causes the response to be @@ -597,6 +616,12 @@ class EvalConfigDefaultEvalsQueryEaseGuardrailedFallback(TypedDict, total=False) class EvalConfigDefaultEvalsQueryEase(TypedDict, total=False): + """A pre-configured evaluation metric from TrustworthyRAG or built into the system. + + The evaluation criteria and identifiers are immutable and system-managed, + while other properties like thresholds and priorities can be configured. 
+ """ + eval_key: Required[str] """ Unique key for eval metric - currently maps to the TrustworthyRAG name property @@ -635,6 +660,8 @@ class EvalConfigDefaultEvalsQueryEase(TypedDict, total=False): class EvalConfigDefaultEvalsResponseGroundednessGuardrailedFallback(TypedDict, total=False): + """message, priority, type""" + message: Required[str] """ Fallback message to use if this eval fails and causes the response to be @@ -652,6 +679,12 @@ class EvalConfigDefaultEvalsResponseGroundednessGuardrailedFallback(TypedDict, t class EvalConfigDefaultEvalsResponseGroundedness(TypedDict, total=False): + """A pre-configured evaluation metric from TrustworthyRAG or built into the system. + + The evaluation criteria and identifiers are immutable and system-managed, + while other properties like thresholds and priorities can be configured. + """ + eval_key: Required[str] """ Unique key for eval metric - currently maps to the TrustworthyRAG name property @@ -690,6 +723,8 @@ class EvalConfigDefaultEvalsResponseGroundedness(TypedDict, total=False): class EvalConfigDefaultEvalsResponseHelpfulnessGuardrailedFallback(TypedDict, total=False): + """message, priority, type""" + message: Required[str] """ Fallback message to use if this eval fails and causes the response to be @@ -707,6 +742,12 @@ class EvalConfigDefaultEvalsResponseHelpfulnessGuardrailedFallback(TypedDict, to class EvalConfigDefaultEvalsResponseHelpfulness(TypedDict, total=False): + """A pre-configured evaluation metric from TrustworthyRAG or built into the system. + + The evaluation criteria and identifiers are immutable and system-managed, + while other properties like thresholds and priorities can be configured. + """ + eval_key: Required[str] """ Unique key for eval metric - currently maps to the TrustworthyRAG name property @@ -745,6 +786,8 @@ class EvalConfigDefaultEvalsResponseHelpfulness(TypedDict, total=False): class EvalConfigDefaultEvalsTrustworthinessGuardrailedFallback(TypedDict, total=False): + """message, priority, type""" + message: Required[str] """ Fallback message to use if this eval fails and causes the response to be @@ -762,6 +805,12 @@ class EvalConfigDefaultEvalsTrustworthinessGuardrailedFallback(TypedDict, total= class EvalConfigDefaultEvalsTrustworthiness(TypedDict, total=False): + """A pre-configured evaluation metric from TrustworthyRAG or built into the system. + + The evaluation criteria and identifiers are immutable and system-managed, + while other properties like thresholds and priorities can be configured. + """ + eval_key: Required[str] """ Unique key for eval metric - currently maps to the TrustworthyRAG name property @@ -800,6 +849,8 @@ class EvalConfigDefaultEvalsTrustworthiness(TypedDict, total=False): class EvalConfigDefaultEvals(TypedDict, total=False): + """Configuration for default evaluation metrics.""" + context_sufficiency: EvalConfigDefaultEvalsContextSufficiency """A pre-configured evaluation metric from TrustworthyRAG or built into the system. @@ -837,6 +888,8 @@ class EvalConfigDefaultEvals(TypedDict, total=False): class EvalConfig(TypedDict, total=False): + """All of the evals that should be used for this query""" + custom_evals: EvalConfigCustomEvals """Configuration for custom evaluation metrics.""" @@ -1041,6 +1094,80 @@ class MessageChatCompletionDeveloperMessageParam(TypedDict, total=False): class Options(TypedDict, total=False): + """ + Typed dict of advanced configuration options for the Trustworthy Language Model. 
+ Many of these configurations are determined by the quality preset selected + (learn about quality presets in the TLM [initialization method](./#class-tlm)). + Specifying TLMOptions values directly overrides any default values set from the quality preset. + + For all options described below, higher settings will lead to longer runtimes and may consume more tokens internally. + You may not be able to run long prompts (or prompts with long responses) in your account, + unless your token/rate limits are increased. If you hit token limit issues, try lower/less expensive TLMOptions + to be able to run longer prompts/responses, or contact Cleanlab to increase your limits. + + The default values corresponding to each quality preset are: + - **best:** `num_consistency_samples` = 8, `num_self_reflections` = 3, `reasoning_effort` = `"high"`. + - **high:** `num_consistency_samples` = 4, `num_self_reflections` = 3, `reasoning_effort` = `"high"`. + - **medium:** `num_consistency_samples` = 0, `num_self_reflections` = 3, `reasoning_effort` = `"high"`. + - **low:** `num_consistency_samples` = 0, `num_self_reflections` = 3, `reasoning_effort` = `"none"`. + - **base:** `num_consistency_samples` = 0, `num_self_reflections` = 1, `reasoning_effort` = `"none"`. + + By default, TLM uses the "medium" `quality_preset`, "gpt-4.1-mini" base `model`, and `max_tokens` is set to 512. + You can set custom values for these arguments regardless of the quality preset specified. + + Args: + model ({"gpt-5", "gpt-5-mini", "gpt-5-nano", "gpt-4.1", "gpt-4.1-mini", "gpt-4.1-nano", "o4-mini", "o3", "gpt-4.5-preview", "gpt-4o-mini", "gpt-4o", "o3-mini", "o1", "o1-mini", "gpt-4", "gpt-3.5-turbo-16k", "claude-opus-4-0", "claude-sonnet-4-0", "claude-3.7-sonnet", "claude-3.5-sonnet-v2", "claude-3.5-sonnet", "claude-3.5-haiku", "claude-3-haiku", "nova-micro", "nova-lite", "nova-pro"}, default = "gpt-4.1-mini"): Underlying base LLM to use (better models yield better results, faster models yield faster results). + - Models still in beta: "o3", "o1", "o4-mini", "o3-mini", "o1-mini", "gpt-4.5-preview", "claude-opus-4-0", "claude-sonnet-4-0", "claude-3.7-sonnet", "claude-3.5-haiku". + - Recommended models for accuracy: "gpt-5", "gpt-4.1", "o4-mini", "o3", "claude-opus-4-0", "claude-sonnet-4-0". + - Recommended models for low latency/costs: "gpt-4.1-nano", "nova-micro". + + log (list[str], default = []): optionally specify additional logs or metadata that TLM should return. + For instance, include "explanation" here to get explanations of why a response is scored with low trustworthiness. + + custom_eval_criteria (list[dict[str, Any]], default = []): optionally specify custom evaluation criteria beyond the built-in trustworthiness scoring. + The expected input format is a list of dictionaries, where each dictionary has the following keys: + - name: Name of the evaluation criteria. + - criteria: Instructions specifying the evaluation criteria. + + max_tokens (int, default = 512): the maximum number of tokens that can be generated in the response from `TLM.prompt()` as well as during internal trustworthiness scoring. + If you experience token/rate-limit errors, try lowering this number. + For OpenAI models, this parameter must be between 64 and 4096. For Claude models, this parameter must be between 64 and 512.
+ + reasoning_effort ({"none", "low", "medium", "high"}, default = "high"): how much the internal LLM calls are allowed to reason (number of thinking tokens) + when generating alternative possible responses and reflecting on responses during trustworthiness scoring. + Reduce this value to reduce runtimes. Higher values may improve trust scoring. + + num_self_reflections (int, default = 3): the number of different evaluations to perform where the LLM reflects on the response, a factor affecting trust scoring. + The maximum number currently supported is 3. Lower values can reduce runtimes. + Reflection helps quantify aleatoric uncertainty associated with challenging prompts and catches responses that are noticeably incorrect/bad upon further analysis. + This parameter has no effect when `disable_trustworthiness` is True. + + num_consistency_samples (int, default = 8): the amount of internal sampling to measure LLM response consistency, a factor affecting trust scoring. + Must be between 0 and 20. Lower values can reduce runtimes. + Measuring consistency helps quantify the epistemic uncertainty associated with + strange prompts or prompts that are too vague/open-ended to receive a clearly defined 'good' response. + TLM measures consistency via the degree of contradiction between sampled responses that the model considers plausible. + This parameter has no effect when `disable_trustworthiness` is True. + + similarity_measure ({"semantic", "string", "embedding", "embedding_large", "code", "discrepancy"}, default = "discrepancy"): how the + trustworthiness scoring's consistency algorithm measures similarity between alternative responses considered plausible by the model. + Supported similarity measures include: "semantic" (based on natural language inference), + "embedding" (based on vector embedding similarity), "embedding_large" (based on a larger embedding model), + "code" (based on model-based analysis designed to compare code), "discrepancy" (based on model-based analysis of possible discrepancies), + and "string" (based on character/word overlap). Set this to "string" for minimal runtimes. + This parameter has no effect when `num_consistency_samples = 0`. + + num_candidate_responses (int, default = 1): how many alternative candidate responses are internally generated in `TLM.prompt()`. + `TLM.prompt()` scores the trustworthiness of each candidate response, and then returns the most trustworthy one. + You can auto-improve responses by increasing this parameter, but at higher runtimes/costs. + This parameter must be between 1 and 20. It has no effect on `TLM.score()`. + When this parameter is 1, `TLM.prompt()` simply returns a standard LLM response and does not attempt to auto-improve it. + This parameter has no effect when `disable_trustworthiness` is True. + + disable_trustworthiness (bool, default = False): if True, TLM will not compute trust scores, + useful if you only want to compute custom evaluation criteria.
+ """ + custom_eval_criteria: Iterable[object] disable_persistence: bool diff --git a/src/codex/types/project_detect_response.py b/src/codex/types/project_detect_response.py index df03c86..ff0d6ce 100644 --- a/src/codex/types/project_detect_response.py +++ b/src/codex/types/project_detect_response.py @@ -40,6 +40,10 @@ class EvalScores(BaseModel): class GuardrailedFallback(BaseModel): + """ + Name, fallback message, fallback priority, and fallback type of the triggered guardrail with the highest fallback priority + """ + message: str """ Fallback message to use if this eval fails and causes the response to be diff --git a/src/codex/types/project_list_response.py b/src/codex/types/project_list_response.py index e4ce558..d528e47 100644 --- a/src/codex/types/project_list_response.py +++ b/src/codex/types/project_list_response.py @@ -30,6 +30,8 @@ class ProjectConfigEvalConfigCustomEvalsEvalsGuardrailedFallback(BaseModel): + """message, priority, type""" + message: str """ Fallback message to use if this eval fails and causes the response to be @@ -47,6 +49,11 @@ class ProjectConfigEvalConfigCustomEvalsEvalsGuardrailedFallback(BaseModel): class ProjectConfigEvalConfigCustomEvalsEvals(BaseModel): + """A custom evaluation metric created by users. + + The TLMEvalSchema are mutable and stored in the database. + """ + criteria: str """ The evaluation criteria text that describes what aspect is being evaluated and @@ -117,10 +124,14 @@ class ProjectConfigEvalConfigCustomEvalsEvals(BaseModel): class ProjectConfigEvalConfigCustomEvals(BaseModel): + """Configuration for custom evaluation metrics.""" + evals: Optional[Dict[str, ProjectConfigEvalConfigCustomEvalsEvals]] = None class ProjectConfigEvalConfigDefaultEvalsContextSufficiencyGuardrailedFallback(BaseModel): + """message, priority, type""" + message: str """ Fallback message to use if this eval fails and causes the response to be @@ -138,6 +149,12 @@ class ProjectConfigEvalConfigDefaultEvalsContextSufficiencyGuardrailedFallback(B class ProjectConfigEvalConfigDefaultEvalsContextSufficiency(BaseModel): + """A pre-configured evaluation metric from TrustworthyRAG or built into the system. + + The evaluation criteria and identifiers are immutable and system-managed, + while other properties like thresholds and priorities can be configured. + """ + display_name: str """Human-friendly name for display. @@ -182,6 +199,8 @@ class ProjectConfigEvalConfigDefaultEvalsContextSufficiency(BaseModel): class ProjectConfigEvalConfigDefaultEvalsQueryEaseGuardrailedFallback(BaseModel): + """message, priority, type""" + message: str """ Fallback message to use if this eval fails and causes the response to be @@ -199,6 +218,12 @@ class ProjectConfigEvalConfigDefaultEvalsQueryEaseGuardrailedFallback(BaseModel) class ProjectConfigEvalConfigDefaultEvalsQueryEase(BaseModel): + """A pre-configured evaluation metric from TrustworthyRAG or built into the system. + + The evaluation criteria and identifiers are immutable and system-managed, + while other properties like thresholds and priorities can be configured. + """ + display_name: str """Human-friendly name for display. 
@@ -243,6 +268,8 @@ class ProjectConfigEvalConfigDefaultEvalsQueryEase(BaseModel): class ProjectConfigEvalConfigDefaultEvalsResponseGroundednessGuardrailedFallback(BaseModel): + """message, priority, type""" + message: str """ Fallback message to use if this eval fails and causes the response to be @@ -260,6 +287,12 @@ class ProjectConfigEvalConfigDefaultEvalsResponseGroundednessGuardrailedFallback class ProjectConfigEvalConfigDefaultEvalsResponseGroundedness(BaseModel): + """A pre-configured evaluation metric from TrustworthyRAG or built into the system. + + The evaluation criteria and identifiers are immutable and system-managed, + while other properties like thresholds and priorities can be configured. + """ + display_name: str """Human-friendly name for display. @@ -304,6 +337,8 @@ class ProjectConfigEvalConfigDefaultEvalsResponseGroundedness(BaseModel): class ProjectConfigEvalConfigDefaultEvalsResponseHelpfulnessGuardrailedFallback(BaseModel): + """message, priority, type""" + message: str """ Fallback message to use if this eval fails and causes the response to be @@ -321,6 +356,12 @@ class ProjectConfigEvalConfigDefaultEvalsResponseHelpfulnessGuardrailedFallback( class ProjectConfigEvalConfigDefaultEvalsResponseHelpfulness(BaseModel): + """A pre-configured evaluation metric from TrustworthyRAG or built into the system. + + The evaluation criteria and identifiers are immutable and system-managed, + while other properties like thresholds and priorities can be configured. + """ + display_name: str """Human-friendly name for display. @@ -365,6 +406,8 @@ class ProjectConfigEvalConfigDefaultEvalsResponseHelpfulness(BaseModel): class ProjectConfigEvalConfigDefaultEvalsTrustworthinessGuardrailedFallback(BaseModel): + """message, priority, type""" + message: str """ Fallback message to use if this eval fails and causes the response to be @@ -382,6 +425,12 @@ class ProjectConfigEvalConfigDefaultEvalsTrustworthinessGuardrailedFallback(Base class ProjectConfigEvalConfigDefaultEvalsTrustworthiness(BaseModel): + """A pre-configured evaluation metric from TrustworthyRAG or built into the system. + + The evaluation criteria and identifiers are immutable and system-managed, + while other properties like thresholds and priorities can be configured. + """ + display_name: str """Human-friendly name for display. @@ -426,6 +475,8 @@ class ProjectConfigEvalConfigDefaultEvalsTrustworthiness(BaseModel): class ProjectConfigEvalConfigDefaultEvals(BaseModel): + """Configuration for default evaluation metrics.""" + context_sufficiency: Optional[ProjectConfigEvalConfigDefaultEvalsContextSufficiency] = None """A pre-configured evaluation metric from TrustworthyRAG or built into the system. 
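Because every level of the `ProjectConfigEvalConfig...` response models above is `Optional`, consumers need None-guards when traversing a project's eval configuration. A small hedged helper (`eval_config` is any model shaped like `ProjectConfigEvalConfig`; attribute names mirror the field definitions above):

```python
def list_custom_evals(eval_config) -> list[str]:
    """Summarize configured custom evals as "key: criteria" strings.

    Both `custom_evals` and its `evals` dict are Optional per the model
    definitions above, hence the guards.
    """
    summaries: list[str] = []
    custom = eval_config.custom_evals
    if custom is not None and custom.evals is not None:
        for key, ev in custom.evals.items():
            summaries.append(f"{key}: {ev.criteria}")
    return summaries
```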
@@ -463,6 +514,8 @@ class ProjectConfigEvalConfigDefaultEvals(BaseModel): class ProjectConfigEvalConfig(BaseModel): + """Configuration for project-specific evaluation metrics""" + custom_evals: Optional[ProjectConfigEvalConfigCustomEvals] = None """Configuration for custom evaluation metrics.""" @@ -522,6 +575,8 @@ class Project(BaseModel): class Filters(BaseModel): + """Applied filters for the projects list request""" + query: Optional[str] = None diff --git a/src/codex/types/project_retrieve_response.py b/src/codex/types/project_retrieve_response.py index 8fe7741..abc05ad 100644 --- a/src/codex/types/project_retrieve_response.py +++ b/src/codex/types/project_retrieve_response.py @@ -28,6 +28,8 @@ class ConfigEvalConfigCustomEvalsEvalsGuardrailedFallback(BaseModel): + """message, priority, type""" + message: str """ Fallback message to use if this eval fails and causes the response to be @@ -45,6 +47,11 @@ class ConfigEvalConfigCustomEvalsEvalsGuardrailedFallback(BaseModel): class ConfigEvalConfigCustomEvalsEvals(BaseModel): + """A custom evaluation metric created by users. + + The TLMEvalSchema are mutable and stored in the database. + """ + criteria: str """ The evaluation criteria text that describes what aspect is being evaluated and @@ -115,10 +122,14 @@ class ConfigEvalConfigCustomEvalsEvals(BaseModel): class ConfigEvalConfigCustomEvals(BaseModel): + """Configuration for custom evaluation metrics.""" + evals: Optional[Dict[str, ConfigEvalConfigCustomEvalsEvals]] = None class ConfigEvalConfigDefaultEvalsContextSufficiencyGuardrailedFallback(BaseModel): + """message, priority, type""" + message: str """ Fallback message to use if this eval fails and causes the response to be @@ -136,6 +147,12 @@ class ConfigEvalConfigDefaultEvalsContextSufficiencyGuardrailedFallback(BaseMode class ConfigEvalConfigDefaultEvalsContextSufficiency(BaseModel): + """A pre-configured evaluation metric from TrustworthyRAG or built into the system. + + The evaluation criteria and identifiers are immutable and system-managed, + while other properties like thresholds and priorities can be configured. + """ + display_name: str """Human-friendly name for display. @@ -180,6 +197,8 @@ class ConfigEvalConfigDefaultEvalsContextSufficiency(BaseModel): class ConfigEvalConfigDefaultEvalsQueryEaseGuardrailedFallback(BaseModel): + """message, priority, type""" + message: str """ Fallback message to use if this eval fails and causes the response to be @@ -197,6 +216,12 @@ class ConfigEvalConfigDefaultEvalsQueryEaseGuardrailedFallback(BaseModel): class ConfigEvalConfigDefaultEvalsQueryEase(BaseModel): + """A pre-configured evaluation metric from TrustworthyRAG or built into the system. + + The evaluation criteria and identifiers are immutable and system-managed, + while other properties like thresholds and priorities can be configured. + """ + display_name: str """Human-friendly name for display. @@ -241,6 +266,8 @@ class ConfigEvalConfigDefaultEvalsQueryEase(BaseModel): class ConfigEvalConfigDefaultEvalsResponseGroundednessGuardrailedFallback(BaseModel): + """message, priority, type""" + message: str """ Fallback message to use if this eval fails and causes the response to be @@ -258,6 +285,12 @@ class ConfigEvalConfigDefaultEvalsResponseGroundednessGuardrailedFallback(BaseMo class ConfigEvalConfigDefaultEvalsResponseGroundedness(BaseModel): + """A pre-configured evaluation metric from TrustworthyRAG or built into the system. 
+ + The evaluation criteria and identifiers are immutable and system-managed, + while other properties like thresholds and priorities can be configured. + """ + display_name: str """Human-friendly name for display. @@ -302,6 +335,8 @@ class ConfigEvalConfigDefaultEvalsResponseGroundedness(BaseModel): class ConfigEvalConfigDefaultEvalsResponseHelpfulnessGuardrailedFallback(BaseModel): + """message, priority, type""" + message: str """ Fallback message to use if this eval fails and causes the response to be @@ -319,6 +354,12 @@ class ConfigEvalConfigDefaultEvalsResponseHelpfulnessGuardrailedFallback(BaseMod class ConfigEvalConfigDefaultEvalsResponseHelpfulness(BaseModel): + """A pre-configured evaluation metric from TrustworthyRAG or built into the system. + + The evaluation criteria and identifiers are immutable and system-managed, + while other properties like thresholds and priorities can be configured. + """ + display_name: str """Human-friendly name for display. @@ -363,6 +404,8 @@ class ConfigEvalConfigDefaultEvalsResponseHelpfulness(BaseModel): class ConfigEvalConfigDefaultEvalsTrustworthinessGuardrailedFallback(BaseModel): + """message, priority, type""" + message: str """ Fallback message to use if this eval fails and causes the response to be @@ -380,6 +423,12 @@ class ConfigEvalConfigDefaultEvalsTrustworthinessGuardrailedFallback(BaseModel): class ConfigEvalConfigDefaultEvalsTrustworthiness(BaseModel): + """A pre-configured evaluation metric from TrustworthyRAG or built into the system. + + The evaluation criteria and identifiers are immutable and system-managed, + while other properties like thresholds and priorities can be configured. + """ + display_name: str """Human-friendly name for display. @@ -424,6 +473,8 @@ class ConfigEvalConfigDefaultEvalsTrustworthiness(BaseModel): class ConfigEvalConfigDefaultEvals(BaseModel): + """Configuration for default evaluation metrics.""" + context_sufficiency: Optional[ConfigEvalConfigDefaultEvalsContextSufficiency] = None """A pre-configured evaluation metric from TrustworthyRAG or built into the system. @@ -461,6 +512,8 @@ class ConfigEvalConfigDefaultEvals(BaseModel): class ConfigEvalConfig(BaseModel): + """Configuration for project-specific evaluation metrics""" + custom_evals: Optional[ConfigEvalConfigCustomEvals] = None """Configuration for custom evaluation metrics.""" diff --git a/src/codex/types/project_return_schema.py b/src/codex/types/project_return_schema.py index 423d0ce..07a3a9b 100644 --- a/src/codex/types/project_return_schema.py +++ b/src/codex/types/project_return_schema.py @@ -28,6 +28,8 @@ class ConfigEvalConfigCustomEvalsEvalsGuardrailedFallback(BaseModel): + """message, priority, type""" + message: str """ Fallback message to use if this eval fails and causes the response to be @@ -45,6 +47,11 @@ class ConfigEvalConfigCustomEvalsEvalsGuardrailedFallback(BaseModel): class ConfigEvalConfigCustomEvalsEvals(BaseModel): + """A custom evaluation metric created by users. + + The TLMEvalSchema are mutable and stored in the database. 
+ """ + criteria: str """ The evaluation criteria text that describes what aspect is being evaluated and @@ -115,10 +122,14 @@ class ConfigEvalConfigCustomEvalsEvals(BaseModel): class ConfigEvalConfigCustomEvals(BaseModel): + """Configuration for custom evaluation metrics.""" + evals: Optional[Dict[str, ConfigEvalConfigCustomEvalsEvals]] = None class ConfigEvalConfigDefaultEvalsContextSufficiencyGuardrailedFallback(BaseModel): + """message, priority, type""" + message: str """ Fallback message to use if this eval fails and causes the response to be @@ -136,6 +147,12 @@ class ConfigEvalConfigDefaultEvalsContextSufficiencyGuardrailedFallback(BaseMode class ConfigEvalConfigDefaultEvalsContextSufficiency(BaseModel): + """A pre-configured evaluation metric from TrustworthyRAG or built into the system. + + The evaluation criteria and identifiers are immutable and system-managed, + while other properties like thresholds and priorities can be configured. + """ + display_name: str """Human-friendly name for display. @@ -180,6 +197,8 @@ class ConfigEvalConfigDefaultEvalsContextSufficiency(BaseModel): class ConfigEvalConfigDefaultEvalsQueryEaseGuardrailedFallback(BaseModel): + """message, priority, type""" + message: str """ Fallback message to use if this eval fails and causes the response to be @@ -197,6 +216,12 @@ class ConfigEvalConfigDefaultEvalsQueryEaseGuardrailedFallback(BaseModel): class ConfigEvalConfigDefaultEvalsQueryEase(BaseModel): + """A pre-configured evaluation metric from TrustworthyRAG or built into the system. + + The evaluation criteria and identifiers are immutable and system-managed, + while other properties like thresholds and priorities can be configured. + """ + display_name: str """Human-friendly name for display. @@ -241,6 +266,8 @@ class ConfigEvalConfigDefaultEvalsQueryEase(BaseModel): class ConfigEvalConfigDefaultEvalsResponseGroundednessGuardrailedFallback(BaseModel): + """message, priority, type""" + message: str """ Fallback message to use if this eval fails and causes the response to be @@ -258,6 +285,12 @@ class ConfigEvalConfigDefaultEvalsResponseGroundednessGuardrailedFallback(BaseMo class ConfigEvalConfigDefaultEvalsResponseGroundedness(BaseModel): + """A pre-configured evaluation metric from TrustworthyRAG or built into the system. + + The evaluation criteria and identifiers are immutable and system-managed, + while other properties like thresholds and priorities can be configured. + """ + display_name: str """Human-friendly name for display. @@ -302,6 +335,8 @@ class ConfigEvalConfigDefaultEvalsResponseGroundedness(BaseModel): class ConfigEvalConfigDefaultEvalsResponseHelpfulnessGuardrailedFallback(BaseModel): + """message, priority, type""" + message: str """ Fallback message to use if this eval fails and causes the response to be @@ -319,6 +354,12 @@ class ConfigEvalConfigDefaultEvalsResponseHelpfulnessGuardrailedFallback(BaseMod class ConfigEvalConfigDefaultEvalsResponseHelpfulness(BaseModel): + """A pre-configured evaluation metric from TrustworthyRAG or built into the system. + + The evaluation criteria and identifiers are immutable and system-managed, + while other properties like thresholds and priorities can be configured. + """ + display_name: str """Human-friendly name for display. 
@@ -363,6 +404,8 @@ class ConfigEvalConfigDefaultEvalsResponseHelpfulness(BaseModel):
 class ConfigEvalConfigDefaultEvalsTrustworthinessGuardrailedFallback(BaseModel):
+    """message, priority, type"""
+
     message: str
     """
     Fallback message to use if this eval fails and causes the response to be
@@ -380,6 +423,12 @@ class ConfigEvalConfigDefaultEvalsTrustworthinessGuardrailedFallback(BaseModel):
 class ConfigEvalConfigDefaultEvalsTrustworthiness(BaseModel):
+    """A pre-configured evaluation metric from TrustworthyRAG or built into the system.
+
+    The evaluation criteria and identifiers are immutable and system-managed,
+    while other properties like thresholds and priorities can be configured.
+    """
+
     display_name: str
     """Human-friendly name for display.

@@ -424,6 +473,8 @@ class ConfigEvalConfigDefaultEvalsTrustworthiness(BaseModel):
 class ConfigEvalConfigDefaultEvals(BaseModel):
+    """Configuration for default evaluation metrics."""
+
     context_sufficiency: Optional[ConfigEvalConfigDefaultEvalsContextSufficiency] = None
     """A pre-configured evaluation metric from TrustworthyRAG or built into the system.

@@ -461,6 +512,8 @@ class ConfigEvalConfigDefaultEvals(BaseModel):
 class ConfigEvalConfig(BaseModel):
+    """Configuration for project-specific evaluation metrics."""
+
     custom_evals: Optional[ConfigEvalConfigCustomEvals] = None
     """Configuration for custom evaluation metrics."""

diff --git a/src/codex/types/project_update_params.py b/src/codex/types/project_update_params.py
index 3557c2d..68cb0d3 100644
--- a/src/codex/types/project_update_params.py
+++ b/src/codex/types/project_update_params.py
@@ -37,6 +37,8 @@ class ProjectUpdateParams(TypedDict, total=False):
 class ConfigEvalConfigCustomEvalsEvalsGuardrailedFallback(TypedDict, total=False):
+    """message, priority, type"""
+
     message: Required[str]
     """
     Fallback message to use if this eval fails and causes the response to be
@@ -54,6 +56,11 @@ class ConfigEvalConfigCustomEvalsEvalsGuardrailedFallback(TypedDict, total=False):
 class ConfigEvalConfigCustomEvalsEvals(TypedDict, total=False):
+    """A custom evaluation metric created by users.
+
+    TLMEvalSchema objects are mutable and stored in the database.
+    """
+
     criteria: Required[str]
     """
     The evaluation criteria text that describes what aspect is being evaluated and
@@ -118,10 +125,14 @@ class ConfigEvalConfigCustomEvalsEvals(TypedDict, total=False):
 class ConfigEvalConfigCustomEvals(TypedDict, total=False):
+    """Configuration for custom evaluation metrics."""
+
     evals: Dict[str, ConfigEvalConfigCustomEvalsEvals]


 class ConfigEvalConfigDefaultEvalsContextSufficiencyGuardrailedFallback(TypedDict, total=False):
+    """message, priority, type"""
+
     message: Required[str]
     """
     Fallback message to use if this eval fails and causes the response to be
@@ -139,6 +150,12 @@ class ConfigEvalConfigDefaultEvalsContextSufficiencyGuardrailedFallback(TypedDict, total=False):
 class ConfigEvalConfigDefaultEvalsContextSufficiency(TypedDict, total=False):
+    """A pre-configured evaluation metric from TrustworthyRAG or built into the system.
+
+    The evaluation criteria and identifiers are immutable and system-managed,
+    while other properties like thresholds and priorities can be configured.
+ """ + eval_key: Required[str] """ Unique key for eval metric - currently maps to the TrustworthyRAG name property @@ -177,6 +194,8 @@ class ConfigEvalConfigDefaultEvalsContextSufficiency(TypedDict, total=False): class ConfigEvalConfigDefaultEvalsQueryEaseGuardrailedFallback(TypedDict, total=False): + """message, priority, type""" + message: Required[str] """ Fallback message to use if this eval fails and causes the response to be @@ -194,6 +213,12 @@ class ConfigEvalConfigDefaultEvalsQueryEaseGuardrailedFallback(TypedDict, total= class ConfigEvalConfigDefaultEvalsQueryEase(TypedDict, total=False): + """A pre-configured evaluation metric from TrustworthyRAG or built into the system. + + The evaluation criteria and identifiers are immutable and system-managed, + while other properties like thresholds and priorities can be configured. + """ + eval_key: Required[str] """ Unique key for eval metric - currently maps to the TrustworthyRAG name property @@ -232,6 +257,8 @@ class ConfigEvalConfigDefaultEvalsQueryEase(TypedDict, total=False): class ConfigEvalConfigDefaultEvalsResponseGroundednessGuardrailedFallback(TypedDict, total=False): + """message, priority, type""" + message: Required[str] """ Fallback message to use if this eval fails and causes the response to be @@ -249,6 +276,12 @@ class ConfigEvalConfigDefaultEvalsResponseGroundednessGuardrailedFallback(TypedD class ConfigEvalConfigDefaultEvalsResponseGroundedness(TypedDict, total=False): + """A pre-configured evaluation metric from TrustworthyRAG or built into the system. + + The evaluation criteria and identifiers are immutable and system-managed, + while other properties like thresholds and priorities can be configured. + """ + eval_key: Required[str] """ Unique key for eval metric - currently maps to the TrustworthyRAG name property @@ -287,6 +320,8 @@ class ConfigEvalConfigDefaultEvalsResponseGroundedness(TypedDict, total=False): class ConfigEvalConfigDefaultEvalsResponseHelpfulnessGuardrailedFallback(TypedDict, total=False): + """message, priority, type""" + message: Required[str] """ Fallback message to use if this eval fails and causes the response to be @@ -304,6 +339,12 @@ class ConfigEvalConfigDefaultEvalsResponseHelpfulnessGuardrailedFallback(TypedDi class ConfigEvalConfigDefaultEvalsResponseHelpfulness(TypedDict, total=False): + """A pre-configured evaluation metric from TrustworthyRAG or built into the system. + + The evaluation criteria and identifiers are immutable and system-managed, + while other properties like thresholds and priorities can be configured. + """ + eval_key: Required[str] """ Unique key for eval metric - currently maps to the TrustworthyRAG name property @@ -342,6 +383,8 @@ class ConfigEvalConfigDefaultEvalsResponseHelpfulness(TypedDict, total=False): class ConfigEvalConfigDefaultEvalsTrustworthinessGuardrailedFallback(TypedDict, total=False): + """message, priority, type""" + message: Required[str] """ Fallback message to use if this eval fails and causes the response to be @@ -359,6 +402,12 @@ class ConfigEvalConfigDefaultEvalsTrustworthinessGuardrailedFallback(TypedDict, class ConfigEvalConfigDefaultEvalsTrustworthiness(TypedDict, total=False): + """A pre-configured evaluation metric from TrustworthyRAG or built into the system. + + The evaluation criteria and identifiers are immutable and system-managed, + while other properties like thresholds and priorities can be configured. 
+ """ + eval_key: Required[str] """ Unique key for eval metric - currently maps to the TrustworthyRAG name property @@ -397,6 +446,8 @@ class ConfigEvalConfigDefaultEvalsTrustworthiness(TypedDict, total=False): class ConfigEvalConfigDefaultEvals(TypedDict, total=False): + """Configuration for default evaluation metrics.""" + context_sufficiency: ConfigEvalConfigDefaultEvalsContextSufficiency """A pre-configured evaluation metric from TrustworthyRAG or built into the system. @@ -434,6 +485,8 @@ class ConfigEvalConfigDefaultEvals(TypedDict, total=False): class ConfigEvalConfig(TypedDict, total=False): + """Configuration for project-specific evaluation metrics""" + custom_evals: ConfigEvalConfigCustomEvals """Configuration for custom evaluation metrics.""" diff --git a/src/codex/types/project_validate_params.py b/src/codex/types/project_validate_params.py index 0efa430..1ea5392 100644 --- a/src/codex/types/project_validate_params.py +++ b/src/codex/types/project_validate_params.py @@ -644,6 +644,80 @@ class MessageChatCompletionDeveloperMessageParam(TypedDict, total=False): class Options(TypedDict, total=False): + """ + Typed dict of advanced configuration options for the Trustworthy Language Model. + Many of these configurations are determined by the quality preset selected + (learn about quality presets in the TLM [initialization method](./#class-tlm)). + Specifying TLMOptions values directly overrides any default values set from the quality preset. + + For all options described below, higher settings will lead to longer runtimes and may consume more tokens internally. + You may not be able to run long prompts (or prompts with long responses) in your account, + unless your token/rate limits are increased. If you hit token limit issues, try lower/less expensive TLMOptions + to be able to run longer prompts/responses, or contact Cleanlab to increase your limits. + + The default values corresponding to each quality preset are: + - **best:** `num_consistency_samples` = 8, `num_self_reflections` = 3, `reasoning_effort` = `"high"`. + - **high:** `num_consistency_samples` = 4, `num_self_reflections` = 3, `reasoning_effort` = `"high"`. + - **medium:** `num_consistency_samples` = 0, `num_self_reflections` = 3, `reasoning_effort` = `"high"`. + - **low:** `num_consistency_samples` = 0, `num_self_reflections` = 3, `reasoning_effort` = `"none"`. + - **base:** `num_consistency_samples` = 0, `num_self_reflections` = 1, `reasoning_effort` = `"none"`. + + By default, TLM uses the: "medium" `quality_preset`, "gpt-4.1-mini" base `model`, and `max_tokens` is set to 512. + You can set custom values for these arguments regardless of the quality preset specified. + + Args: + model ({"gpt-5", "gpt-5-mini", "gpt-5-nano", "gpt-4.1", "gpt-4.1-mini", "gpt-4.1-nano", "o4-mini", "o3", "gpt-4.5-preview", "gpt-4o-mini", "gpt-4o", "o3-mini", "o1", "o1-mini", "gpt-4", "gpt-3.5-turbo-16k", "claude-opus-4-0", "claude-sonnet-4-0", "claude-3.7-sonnet", "claude-3.5-sonnet-v2", "claude-3.5-sonnet", "claude-3.5-haiku", "claude-3-haiku", "nova-micro", "nova-lite", "nova-pro"}, default = "gpt-4.1-mini"): Underlying base LLM to use (better models yield better results, faster models yield faster results). + - Models still in beta: "o3", "o1", "o4-mini", "o3-mini", "o1-mini", "gpt-4.5-preview", "claude-opus-4-0", "claude-sonnet-4-0", "claude-3.7-sonnet", "claude-3.5-haiku". + - Recommended models for accuracy: "gpt-5", "gpt-4.1", "o4-mini", "o3", "claude-opus-4-0", "claude-sonnet-4-0". 
+        - Recommended models for low latency/costs: "gpt-4.1-nano", "nova-micro".
+
+        log (list[str], default = []): optionally specify additional logs or metadata that TLM should return.
+        For instance, include "explanation" here to get explanations of why a response is scored with low trustworthiness.
+
+        custom_eval_criteria (list[dict[str, Any]], default = []): optionally specify custom evaluation criteria beyond the built-in trustworthiness scoring.
+        The expected input format is a list of dictionaries, where each dictionary has the following keys:
+        - name: Name of the evaluation criteria.
+        - criteria: Instructions specifying the evaluation criteria.
+
+        max_tokens (int, default = 512): the maximum number of tokens that can be generated in the response from `TLM.prompt()` as well as during internal trustworthiness scoring.
+        If you experience token/rate-limit errors, try lowering this number.
+        For OpenAI models, this parameter must be between 64 and 4096. For Claude models, this parameter must be between 64 and 512.
+
+        reasoning_effort ({"none", "low", "medium", "high"}, default = "high"): how much internal LLM calls are allowed to reason (number of thinking tokens)
+        when generating alternative possible responses and reflecting on responses during trustworthiness scoring.
+        Reduce this value to reduce runtimes. Higher values may improve trust scoring.
+
+        num_self_reflections (int, default = 3): the number of different evaluations to perform where the LLM reflects on the response, a factor affecting trust scoring.
+        The maximum number currently supported is 3. Lower values can reduce runtimes.
+        Reflection helps quantify aleatoric uncertainty associated with challenging prompts and catches responses that are noticeably incorrect/bad upon further analysis.
+        This parameter has no effect when `disable_trustworthiness` is True.
+
+        num_consistency_samples (int, default = 8): the amount of internal sampling to measure LLM response consistency, a factor affecting trust scoring.
+        Must be between 0 and 20. Lower values can reduce runtimes.
+        Measuring consistency helps quantify the epistemic uncertainty associated with
+        strange prompts or prompts that are too vague/open-ended to receive a clearly defined 'good' response.
+        TLM measures consistency via the degree of contradiction between sampled responses that the model considers plausible.
+        This parameter has no effect when `disable_trustworthiness` is True.
+
+        similarity_measure ({"semantic", "string", "embedding", "embedding_large", "code", "discrepancy"}, default = "discrepancy"): how the
+        trustworthiness scoring's consistency algorithm measures similarity between alternative responses considered plausible by the model.
+        Supported similarity measures include - "semantic" (based on natural language inference),
+        "embedding" (based on vector embedding similarity), "embedding_large" (based on a larger embedding model),
+        "code" (based on model-based analysis designed to compare code), "discrepancy" (based on model-based analysis of possible discrepancies),
+        and "string" (based on character/word overlap). Set this to "string" for minimal runtimes.
+        This parameter has no effect when `num_consistency_samples = 0`.
+
+        num_candidate_responses (int, default = 1): how many alternative candidate responses are internally generated in `TLM.prompt()`.
+        `TLM.prompt()` scores the trustworthiness of each candidate response, and then returns the most trustworthy one.
+        You can auto-improve responses by increasing this parameter, but at higher runtimes/costs.
+        This parameter must be between 1 and 20. It has no effect on `TLM.score()`.
+        When this parameter is 1, `TLM.prompt()` simply returns a standard LLM response and does not attempt to auto-improve it.
+        This parameter has no effect when `disable_trustworthiness` is True.
+
+        disable_trustworthiness (bool, default = False): if True, TLM will not compute trust scores,
+        useful if you only want to compute custom evaluation criteria.
+    """
+
     custom_eval_criteria: Iterable[object]

     disable_persistence: bool

diff --git a/src/codex/types/project_validate_response.py b/src/codex/types/project_validate_response.py
index b9166c2..895db6f 100644
--- a/src/codex/types/project_validate_response.py
+++ b/src/codex/types/project_validate_response.py
@@ -74,6 +74,10 @@ class EvalScores(BaseModel):
 class GuardrailedFallback(BaseModel):
+    """
+    Name, fallback message, fallback priority, and fallback type of the triggered guardrail with the highest fallback priority
+    """
+
     message: str
     """
     Fallback message to use if this eval fails and causes the response to be

diff --git a/src/codex/types/projects/eval_create_params.py b/src/codex/types/projects/eval_create_params.py
index d4ec41e..d319f92 100644
--- a/src/codex/types/projects/eval_create_params.py
+++ b/src/codex/types/projects/eval_create_params.py
@@ -73,6 +73,8 @@ class EvalCreateParams(TypedDict, total=False):
 class GuardrailedFallback(TypedDict, total=False):
+    """message, priority, type"""
+
     message: Required[str]
     """
     Fallback message to use if this eval fails and causes the response to be

diff --git a/src/codex/types/projects/eval_list_response.py b/src/codex/types/projects/eval_list_response.py
index 2aa0d75..47bdd3d 100644
--- a/src/codex/types/projects/eval_list_response.py
+++ b/src/codex/types/projects/eval_list_response.py
@@ -9,6 +9,8 @@
 class EvalGuardrailedFallback(BaseModel):
+    """message, priority, type"""
+
     message: str
     """
     Fallback message to use if this eval fails and causes the response to be
@@ -96,6 +98,8 @@ class Eval(BaseModel):
 class EvalListResponse(BaseModel):
+    """Schema for paginated evals response."""
+
     evals: List[Eval]

     total_count: int

diff --git a/src/codex/types/projects/eval_update_params.py b/src/codex/types/projects/eval_update_params.py
index 7da4e1e..87dc940 100644
--- a/src/codex/types/projects/eval_update_params.py
+++ b/src/codex/types/projects/eval_update_params.py
@@ -83,6 +85,8 @@ class CustomEvalCreateOrUpdateSchema(TypedDict, total=False):
 class CustomEvalCreateOrUpdateSchemaGuardrailedFallback(TypedDict, total=False):
+    """message, priority, type"""
+
     message: Required[str]
     """
     Fallback message to use if this eval fails and causes the response to be
@@ -137,6 +139,8 @@ class DefaultEvalUpdateSchema(TypedDict, total=False):
 class DefaultEvalUpdateSchemaGuardrailedFallback(TypedDict, total=False):
+    """message, priority, type"""
+
     message: Required[str]
     """
     Fallback message to use if this eval fails and causes the response to be

diff --git a/src/codex/types/projects/query_log_list_by_group_response.py b/src/codex/types/projects/query_log_list_by_group_response.py
index 1a89baa..4638a3f 100644
--- a/src/codex/types/projects/query_log_list_by_group_response.py
+++ b/src/codex/types/projects/query_log_list_by_group_response.py
@@ -84,6 +84,12 @@ class QueryLogsByGroupQueryLogFormattedNonGuardrailEvalScores(BaseModel):
 class QueryLogsByGroupQueryLogContext(BaseModel):
+    """Represents a document in RAG context.
+
+    This schema is designed to be flexible while maintaining structure for RAG systems.
+    It supports both simple string content and rich document metadata.
+    """
+
     content: str
     """The actual content/text of the document."""

@@ -142,6 +148,10 @@ class QueryLogsByGroupQueryLogEvaluatedResponseToolCall(BaseModel):
 class QueryLogsByGroupQueryLogGuardrailedFallback(BaseModel):
+    """
+    Name, fallback message, priority, and type for the triggered guardrail with the highest priority
+    """
+
     message: str
     """
     Fallback message to use if this eval fails and causes the response to be
@@ -467,12 +477,6 @@ class QueryLogsByGroupQueryLog(BaseModel):
     expert review. Expert review will override the original guardrail decision.
     """

-    expert_override_log_id: Optional[str] = None
-    """
-    ID of the query log with expert review that overrode the original guardrail
-    decision.
-    """
-
     expert_review_created_at: Optional[datetime] = None
     """When the expert review was created"""
@@ -529,6 +533,15 @@ class QueryLogsByGroupQueryLog(BaseModel):
     primary_eval_issue_score: Optional[float] = None
     """Score of the primary eval issue"""

+    system_prompt: Optional[str] = None
+    """
+    Content of the first system message associated with this query log, if
+    available.
+    """
+
+    system_prompt_hash: Optional[str] = None
+    """SHA-256 hash of the system prompt content for quick equality checks."""
+
     tools: Optional[List[QueryLogsByGroupQueryLogTool]] = None
     """Tools to use for the LLM call.

@@ -543,6 +556,8 @@ class QueryLogsByGroup(BaseModel):
 class Filters(BaseModel):
+    """Applied filters for the query"""
+
     custom_metadata_dict: Optional[object] = None

     created_at_end: Optional[datetime] = None

diff --git a/src/codex/types/projects/query_log_list_groups_response.py b/src/codex/types/projects/query_log_list_groups_response.py
index fe70223..c5c2a4d 100644
--- a/src/codex/types/projects/query_log_list_groups_response.py
+++ b/src/codex/types/projects/query_log_list_groups_response.py
@@ -81,6 +81,12 @@ class FormattedNonGuardrailEvalScores(BaseModel):
 class Context(BaseModel):
+    """Represents a document in RAG context.
+
+    This schema is designed to be flexible while maintaining structure for RAG systems.
+    It supports both simple string content and rich document metadata.
+    """
+
     content: str
     """The actual content/text of the document."""

@@ -139,6 +145,10 @@ class EvaluatedResponseToolCall(BaseModel):
 class GuardrailedFallback(BaseModel):
+    """
+    Name, fallback message, priority, and type for the triggered guardrail with the highest priority
+    """
+
     message: str
     """
     Fallback message to use if this eval fails and causes the response to be
@@ -462,12 +472,6 @@ class QueryLogListGroupsResponse(BaseModel):
     expert review. Expert review will override the original guardrail decision.
     """

-    expert_override_log_id: Optional[str] = None
-    """
-    ID of the query log with expert review that overrode the original guardrail
-    decision.
-    """
-
     expert_review_created_at: Optional[datetime] = None
     """When the expert review was created"""
@@ -524,6 +528,15 @@ class QueryLogListGroupsResponse(BaseModel):
     primary_eval_issue_score: Optional[float] = None
     """Score of the primary eval issue"""

+    system_prompt: Optional[str] = None
+    """
+    Content of the first system message associated with this query log, if
+    available.
+    """
+
+    system_prompt_hash: Optional[str] = None
+    """SHA-256 hash of the system prompt content for quick equality checks."""
+
     tools: Optional[List[Tool]] = None
     """Tools to use for the LLM call.
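For orientation, the `Options` TypedDict documented in `project_validate_params.py` above is supplied as a plain dict at call sites. Below is a minimal sketch using only keys named in that docstring; the example criterion is illustrative, and how the dict is passed to the validate endpoint is outside this diff.

```python
# Sketch of a TLMOptions-style payload. Every key below comes from the
# docstring added in project_validate_params.py; the "conciseness" criterion
# is a made-up illustration, not part of this diff.
options = {
    "model": "gpt-4.1-mini",         # default base model per the docstring
    "reasoning_effort": "low",       # fewer thinking tokens, faster runs
    "num_consistency_samples": 4,    # the "high" preset value; range is 0-20
    "num_self_reflections": 3,       # maximum currently supported
    "similarity_measure": "string",  # cheapest consistency measure
    "log": ["explanation"],          # ask TLM to return trust-score explanations
    "custom_eval_criteria": [
        {"name": "conciseness", "criteria": "Rate whether the response is concise."}
    ],
}
```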
diff --git a/src/codex/types/projects/query_log_list_response.py b/src/codex/types/projects/query_log_list_response.py
index dc7768f..b558081 100644
--- a/src/codex/types/projects/query_log_list_response.py
+++ b/src/codex/types/projects/query_log_list_response.py
@@ -81,6 +81,12 @@ class FormattedNonGuardrailEvalScores(BaseModel):
 class Context(BaseModel):
+    """Represents a document in RAG context.
+
+    This schema is designed to be flexible while maintaining structure for RAG systems.
+    It supports both simple string content and rich document metadata.
+    """
+
     content: str
     """The actual content/text of the document."""

@@ -139,6 +145,10 @@ class EvaluatedResponseToolCall(BaseModel):
 class GuardrailedFallback(BaseModel):
+    """
+    Name, fallback message, priority, and type for the triggered guardrail with the highest priority
+    """
+
     message: str
     """
     Fallback message to use if this eval fails and causes the response to be
@@ -450,12 +460,6 @@ class QueryLogListResponse(BaseModel):
     expert review. Expert review will override the original guardrail decision.
     """

-    expert_override_log_id: Optional[str] = None
-    """
-    ID of the query log with expert review that overrode the original guardrail
-    decision.
-    """
-
     expert_review_created_at: Optional[datetime] = None
     """When the expert review was created"""
@@ -509,6 +513,15 @@ class QueryLogListResponse(BaseModel):
     primary_eval_issue_score: Optional[float] = None
     """Score of the primary eval issue"""

+    system_prompt: Optional[str] = None
+    """
+    Content of the first system message associated with this query log, if
+    available.
+    """
+
+    system_prompt_hash: Optional[str] = None
+    """SHA-256 hash of the system prompt content for quick equality checks."""
+
     tools: Optional[List[Tool]] = None
     """Tools to use for the LLM call.

diff --git a/src/codex/types/projects/query_log_retrieve_response.py b/src/codex/types/projects/query_log_retrieve_response.py
index db91943..5df2108 100644
--- a/src/codex/types/projects/query_log_retrieve_response.py
+++ b/src/codex/types/projects/query_log_retrieve_response.py
@@ -81,6 +81,12 @@ class FormattedNonGuardrailEvalScores(BaseModel):
 class Context(BaseModel):
+    """Represents a document in RAG context.
+
+    This schema is designed to be flexible while maintaining structure for RAG systems.
+    It supports both simple string content and rich document metadata.
+    """
+
     content: str
     """The actual content/text of the document."""

@@ -139,6 +145,10 @@ class EvaluatedResponseToolCall(BaseModel):
 class GuardrailedFallback(BaseModel):
+    """
+    Name, fallback message, priority, and type for the triggered guardrail with the highest priority
+    """
+
     message: str
     """
     Fallback message to use if this eval fails and causes the response to be
@@ -367,6 +377,8 @@ class QueryLogRetrieveResponse(BaseModel):
     expert_answer_id: Optional[str] = None

+    expert_override_log_id: Optional[str] = None
+
     formatted_escalation_eval_scores: Optional[Dict[str, FormattedEscalationEvalScores]] = None

     formatted_eval_scores: Optional[Dict[str, FormattedEvalScores]] = None
@@ -392,6 +404,8 @@ class QueryLogRetrieveResponse(BaseModel):
     issue_status: Literal["addressed", "unaddressed"]
     """Manual review status override for remediations."""

+    log_needs_review: bool
+
     needs_review: bool

     project_id: str
@@ -457,12 +471,6 @@ class QueryLogRetrieveResponse(BaseModel):
     expert review. Expert review will override the original guardrail decision.
""" - expert_override_log_id: Optional[str] = None - """ - ID of the query log with expert review that overrode the original guardrail - decision. - """ - expert_review_created_at: Optional[datetime] = None """When the expert review was created""" @@ -519,6 +527,15 @@ class QueryLogRetrieveResponse(BaseModel): primary_eval_issue_score: Optional[float] = None """Score of the primary eval issue""" + system_prompt: Optional[str] = None + """ + Content of the first system message associated with this query log, if + available. + """ + + system_prompt_hash: Optional[str] = None + """SHA-256 hash of the system prompt content for quick equality checks.""" + tools: Optional[List[Tool]] = None """Tools to use for the LLM call. diff --git a/src/codex/types/projects/remediation_list_resolved_logs_response.py b/src/codex/types/projects/remediation_list_resolved_logs_response.py index 9f1b77b..d96d129 100644 --- a/src/codex/types/projects/remediation_list_resolved_logs_response.py +++ b/src/codex/types/projects/remediation_list_resolved_logs_response.py @@ -82,6 +82,12 @@ class QueryLogFormattedNonGuardrailEvalScores(BaseModel): class QueryLogContext(BaseModel): + """Represents a document in RAG contex. + + This schema is designed to be flexible while maintaining structure for RAG systems. + It supports both simple string content and rich document metadata. + """ + content: str """The actual content/text of the document.""" @@ -140,6 +146,10 @@ class QueryLogEvaluatedResponseToolCall(BaseModel): class QueryLogGuardrailedFallback(BaseModel): + """ + Name, fallback message, priority, and type for for the triggered guardrail with the highest priority + """ + message: str """ Fallback message to use if this eval fails and causes the response to be @@ -457,12 +467,6 @@ class QueryLog(BaseModel): expert review. Expert review will override the original guardrail decision. """ - expert_override_log_id: Optional[str] = None - """ - ID of the query log with expert review that overrode the original guardrail - decision. - """ - expert_review_created_at: Optional[datetime] = None """When the expert review was created""" @@ -516,6 +520,15 @@ class QueryLog(BaseModel): primary_eval_issue_score: Optional[float] = None """Score of the primary eval issue""" + system_prompt: Optional[str] = None + """ + Content of the first system message associated with this query log, if + available. + """ + + system_prompt_hash: Optional[str] = None + """SHA-256 hash of the system prompt content for quick equality checks.""" + tools: Optional[List[QueryLogTool]] = None """Tools to use for the LLM call. 
diff --git a/src/codex/types/projects/remediations/expert_review_list_response.py b/src/codex/types/projects/remediations/expert_review_list_response.py
index 99d26ab..eadb974 100644
--- a/src/codex/types/projects/remediations/expert_review_list_response.py
+++ b/src/codex/types/projects/remediations/expert_review_list_response.py
@@ -16,6 +16,8 @@ class ExpertReviewListResponse(BaseModel):
     evaluated_response: Optional[str] = None

+    expert_override_log_id: str
+
     last_edited_at: datetime

     last_edited_by: Optional[str] = None

diff --git a/src/codex/types/projects/remediations/expert_review_retrieve_response.py b/src/codex/types/projects/remediations/expert_review_retrieve_response.py
index 9cb0da6..f48fac2 100644
--- a/src/codex/types/projects/remediations/expert_review_retrieve_response.py
+++ b/src/codex/types/projects/remediations/expert_review_retrieve_response.py
@@ -16,6 +16,8 @@ class ExpertReviewRetrieveResponse(BaseModel):
     evaluated_response: Optional[str] = None

+    expert_override_log_id: str
+
     last_edited_at: datetime

     last_edited_by: Optional[str] = None
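These last two hunks make `expert_override_log_id` a required field on the expert-review responses, while the per-log list models above drop their optional copy and `QueryLogRetrieveResponse` keeps one. A sketch of reading the field follows; the client import and resource path are inferred from the module layout and are assumptions, as are all identifiers and argument names.

```python
# Assumed SDK surface: the resource path mirrors the module location
# src/codex/types/projects/remediations/expert_review_retrieve_response.py.
# The retrieve() signature and all IDs here are hypothetical.
from codex import Codex

client = Codex()

review = client.projects.remediations.expert_reviews.retrieve(
    "expert_review_id",
    project_id="project_id",
    remediation_id="remediation_id",
)

# Per the docstring this diff removes from the per-log models, the field
# links the review to the query log whose guardrail decision was overridden.
print(review.expert_override_log_id)
```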