From 45765abf2c2b8f8be1627444a7771f5eeba5f9a3 Mon Sep 17 00:00:00 2001 From: Saoud Rizwan <7799382+saoudrizwan@users.noreply.github.com> Date: Tue, 16 Jun 2026 19:05:00 -0700 Subject: [PATCH 1/2] feat: add togetherai plugin --- README.md | 1 + plugins/togetherai/LICENSE.togetherai-skills | 21 + plugins/togetherai/README.md | 37 + plugins/togetherai/index.ts | 24 + plugins/togetherai/package.json | 20 + .../togetherai/skills/together-audio/SKILL.md | 84 ++ .../together-audio/references/stt-models.md | 302 +++++++ .../together-audio/references/tts-models.md | 227 +++++ .../together-audio/scripts/stt_realtime.py | 125 +++ .../together-audio/scripts/stt_transcribe.py | 221 +++++ .../together-audio/scripts/stt_transcribe.ts | 194 ++++ .../together-audio/scripts/tts_generate.py | 267 ++++++ .../together-audio/scripts/tts_generate.ts | 203 +++++ .../together-audio/scripts/tts_websocket.py | 137 +++ .../skills/together-batch-inference/SKILL.md | 70 ++ .../references/api-reference.md | 311 +++++++ .../scripts/batch_workflow.py | 170 ++++ .../scripts/batch_workflow.ts | 118 +++ .../skills/together-chat-completions/SKILL.md | 116 +++ .../references/api-parameters.md | 474 ++++++++++ .../references/function-calling-patterns.md | 826 ++++++++++++++++++ .../references/models.md | 58 ++ .../references/reasoning-models.md | 423 +++++++++ .../references/structured-outputs.md | 542 ++++++++++++ .../scripts/async_parallel.py | 50 ++ .../scripts/chat_basic.py | 76 ++ .../scripts/chat_basic.ts | 77 ++ .../scripts/debug_headers.py | 57 ++ .../scripts/debug_headers.ts | 64 ++ .../scripts/reasoning_models.py | 150 ++++ .../scripts/reasoning_models.ts | 162 ++++ .../scripts/structured_outputs.py | 142 +++ .../scripts/structured_outputs.ts | 159 ++++ .../scripts/tool_call_loop.py | 119 +++ .../scripts/tool_call_loop.ts | 154 ++++ .../together-dedicated-containers/SKILL.md | 70 ++ .../references/jig-cli.md | 526 +++++++++++ .../references/sprocket-sdk.md | 189 ++++ 
.../scripts/queue_client.py | 109 +++ .../scripts/queue_client.ts | 124 +++ .../scripts/sprocket_hello_world.py | 85 ++ .../together-dedicated-endpoints/SKILL.md | 80 ++ .../references/api-reference.md | 583 ++++++++++++ .../references/dedicated-models.md | 150 ++++ .../references/hardware-options.md | 165 ++++ .../scripts/deploy_finetuned.py | 196 +++++ .../scripts/manage_endpoint.py | 211 +++++ .../scripts/manage_endpoint.ts | 129 +++ .../scripts/upload_custom_model.py | 132 +++ .../skills/together-embeddings/SKILL.md | 75 ++ .../references/api-reference.md | 174 ++++ .../together-embeddings/references/models.md | 93 ++ .../scripts/embed_and_rerank.py | 163 ++++ .../scripts/embed_and_rerank.ts | 68 ++ .../scripts/rag_pipeline.py | 152 ++++ .../scripts/semantic_search.py | 161 ++++ .../skills/together-evaluations/SKILL.md | 81 ++ .../references/api-reference.md | 758 ++++++++++++++++ .../scripts/run_evaluation.py | 467 ++++++++++ .../scripts/run_evaluation.ts | 556 ++++++++++++ .../skills/together-fine-tuning/SKILL.md | 88 ++ .../references/data-formats.md | 368 ++++++++ .../references/deployment.md | 234 +++++ .../references/supported-models.md | 185 ++++ .../scripts/dpo_workflow.py | 267 ++++++ .../scripts/finetune_workflow.py | 240 +++++ .../scripts/function_calling_finetune.py | 309 +++++++ .../scripts/reasoning_finetune.py | 279 ++++++ .../scripts/vlm_finetune.py | 228 +++++ .../skills/together-gpu-clusters/SKILL.md | 88 ++ .../references/api-reference.md | 478 ++++++++++ .../references/cluster-management.md | 464 ++++++++++ .../references/tcloud-cli.md | 313 +++++++ .../scripts/manage_cluster.py | 255 ++++++ .../scripts/manage_cluster.ts | 143 +++ .../scripts/manage_storage.py | 129 +++ .../skills/together-images/SKILL.md | 82 ++ .../references/api-reference.md | 368 ++++++++ .../together-images/references/models.md | 116 +++ .../together-images/scripts/generate_image.py | 147 ++++ .../together-images/scripts/generate_image.ts | 90 ++ 
.../scripts/kontext_editing.py | 186 ++++ .../scripts/lora_generation.py | 100 +++ .../skills/together-sandboxes/SKILL.md | 68 ++ .../references/api-reference.md | 276 ++++++ .../scripts/execute_with_session.py | 130 +++ .../scripts/execute_with_session.ts | 131 +++ .../togetherai/skills/together-video/SKILL.md | 70 ++ .../references/api-reference.md | 312 +++++++ .../together-video/references/models.md | 62 ++ .../together-video/scripts/generate_video.py | 130 +++ .../together-video/scripts/generate_video.ts | 116 +++ .../together-video/scripts/image_to_video.py | 160 ++++ 93 files changed, 18290 insertions(+) create mode 100644 plugins/togetherai/LICENSE.togetherai-skills create mode 100644 plugins/togetherai/README.md create mode 100644 plugins/togetherai/index.ts create mode 100644 plugins/togetherai/package.json create mode 100644 plugins/togetherai/skills/together-audio/SKILL.md create mode 100644 plugins/togetherai/skills/together-audio/references/stt-models.md create mode 100644 plugins/togetherai/skills/together-audio/references/tts-models.md create mode 100644 plugins/togetherai/skills/together-audio/scripts/stt_realtime.py create mode 100644 plugins/togetherai/skills/together-audio/scripts/stt_transcribe.py create mode 100644 plugins/togetherai/skills/together-audio/scripts/stt_transcribe.ts create mode 100644 plugins/togetherai/skills/together-audio/scripts/tts_generate.py create mode 100644 plugins/togetherai/skills/together-audio/scripts/tts_generate.ts create mode 100644 plugins/togetherai/skills/together-audio/scripts/tts_websocket.py create mode 100644 plugins/togetherai/skills/together-batch-inference/SKILL.md create mode 100644 plugins/togetherai/skills/together-batch-inference/references/api-reference.md create mode 100644 plugins/togetherai/skills/together-batch-inference/scripts/batch_workflow.py create mode 100644 plugins/togetherai/skills/together-batch-inference/scripts/batch_workflow.ts create mode 100644 
plugins/togetherai/skills/together-chat-completions/SKILL.md create mode 100644 plugins/togetherai/skills/together-chat-completions/references/api-parameters.md create mode 100644 plugins/togetherai/skills/together-chat-completions/references/function-calling-patterns.md create mode 100644 plugins/togetherai/skills/together-chat-completions/references/models.md create mode 100644 plugins/togetherai/skills/together-chat-completions/references/reasoning-models.md create mode 100644 plugins/togetherai/skills/together-chat-completions/references/structured-outputs.md create mode 100644 plugins/togetherai/skills/together-chat-completions/scripts/async_parallel.py create mode 100644 plugins/togetherai/skills/together-chat-completions/scripts/chat_basic.py create mode 100644 plugins/togetherai/skills/together-chat-completions/scripts/chat_basic.ts create mode 100644 plugins/togetherai/skills/together-chat-completions/scripts/debug_headers.py create mode 100644 plugins/togetherai/skills/together-chat-completions/scripts/debug_headers.ts create mode 100644 plugins/togetherai/skills/together-chat-completions/scripts/reasoning_models.py create mode 100644 plugins/togetherai/skills/together-chat-completions/scripts/reasoning_models.ts create mode 100644 plugins/togetherai/skills/together-chat-completions/scripts/structured_outputs.py create mode 100644 plugins/togetherai/skills/together-chat-completions/scripts/structured_outputs.ts create mode 100644 plugins/togetherai/skills/together-chat-completions/scripts/tool_call_loop.py create mode 100644 plugins/togetherai/skills/together-chat-completions/scripts/tool_call_loop.ts create mode 100644 plugins/togetherai/skills/together-dedicated-containers/SKILL.md create mode 100644 plugins/togetherai/skills/together-dedicated-containers/references/jig-cli.md create mode 100644 plugins/togetherai/skills/together-dedicated-containers/references/sprocket-sdk.md create mode 100644 
plugins/togetherai/skills/together-dedicated-containers/scripts/queue_client.py create mode 100644 plugins/togetherai/skills/together-dedicated-containers/scripts/queue_client.ts create mode 100644 plugins/togetherai/skills/together-dedicated-containers/scripts/sprocket_hello_world.py create mode 100644 plugins/togetherai/skills/together-dedicated-endpoints/SKILL.md create mode 100644 plugins/togetherai/skills/together-dedicated-endpoints/references/api-reference.md create mode 100644 plugins/togetherai/skills/together-dedicated-endpoints/references/dedicated-models.md create mode 100644 plugins/togetherai/skills/together-dedicated-endpoints/references/hardware-options.md create mode 100644 plugins/togetherai/skills/together-dedicated-endpoints/scripts/deploy_finetuned.py create mode 100644 plugins/togetherai/skills/together-dedicated-endpoints/scripts/manage_endpoint.py create mode 100644 plugins/togetherai/skills/together-dedicated-endpoints/scripts/manage_endpoint.ts create mode 100644 plugins/togetherai/skills/together-dedicated-endpoints/scripts/upload_custom_model.py create mode 100644 plugins/togetherai/skills/together-embeddings/SKILL.md create mode 100644 plugins/togetherai/skills/together-embeddings/references/api-reference.md create mode 100644 plugins/togetherai/skills/together-embeddings/references/models.md create mode 100644 plugins/togetherai/skills/together-embeddings/scripts/embed_and_rerank.py create mode 100644 plugins/togetherai/skills/together-embeddings/scripts/embed_and_rerank.ts create mode 100644 plugins/togetherai/skills/together-embeddings/scripts/rag_pipeline.py create mode 100644 plugins/togetherai/skills/together-embeddings/scripts/semantic_search.py create mode 100644 plugins/togetherai/skills/together-evaluations/SKILL.md create mode 100644 plugins/togetherai/skills/together-evaluations/references/api-reference.md create mode 100644 plugins/togetherai/skills/together-evaluations/scripts/run_evaluation.py create mode 100644 
plugins/togetherai/skills/together-evaluations/scripts/run_evaluation.ts create mode 100644 plugins/togetherai/skills/together-fine-tuning/SKILL.md create mode 100644 plugins/togetherai/skills/together-fine-tuning/references/data-formats.md create mode 100644 plugins/togetherai/skills/together-fine-tuning/references/deployment.md create mode 100644 plugins/togetherai/skills/together-fine-tuning/references/supported-models.md create mode 100644 plugins/togetherai/skills/together-fine-tuning/scripts/dpo_workflow.py create mode 100644 plugins/togetherai/skills/together-fine-tuning/scripts/finetune_workflow.py create mode 100644 plugins/togetherai/skills/together-fine-tuning/scripts/function_calling_finetune.py create mode 100644 plugins/togetherai/skills/together-fine-tuning/scripts/reasoning_finetune.py create mode 100644 plugins/togetherai/skills/together-fine-tuning/scripts/vlm_finetune.py create mode 100644 plugins/togetherai/skills/together-gpu-clusters/SKILL.md create mode 100644 plugins/togetherai/skills/together-gpu-clusters/references/api-reference.md create mode 100644 plugins/togetherai/skills/together-gpu-clusters/references/cluster-management.md create mode 100644 plugins/togetherai/skills/together-gpu-clusters/references/tcloud-cli.md create mode 100644 plugins/togetherai/skills/together-gpu-clusters/scripts/manage_cluster.py create mode 100644 plugins/togetherai/skills/together-gpu-clusters/scripts/manage_cluster.ts create mode 100644 plugins/togetherai/skills/together-gpu-clusters/scripts/manage_storage.py create mode 100644 plugins/togetherai/skills/together-images/SKILL.md create mode 100644 plugins/togetherai/skills/together-images/references/api-reference.md create mode 100644 plugins/togetherai/skills/together-images/references/models.md create mode 100644 plugins/togetherai/skills/together-images/scripts/generate_image.py create mode 100644 plugins/togetherai/skills/together-images/scripts/generate_image.ts create mode 100644 
plugins/togetherai/skills/together-images/scripts/kontext_editing.py create mode 100644 plugins/togetherai/skills/together-images/scripts/lora_generation.py create mode 100644 plugins/togetherai/skills/together-sandboxes/SKILL.md create mode 100644 plugins/togetherai/skills/together-sandboxes/references/api-reference.md create mode 100644 plugins/togetherai/skills/together-sandboxes/scripts/execute_with_session.py create mode 100644 plugins/togetherai/skills/together-sandboxes/scripts/execute_with_session.ts create mode 100644 plugins/togetherai/skills/together-video/SKILL.md create mode 100644 plugins/togetherai/skills/together-video/references/api-reference.md create mode 100644 plugins/togetherai/skills/together-video/references/models.md create mode 100644 plugins/togetherai/skills/together-video/scripts/generate_video.py create mode 100644 plugins/togetherai/skills/together-video/scripts/generate_video.ts create mode 100644 plugins/togetherai/skills/together-video/scripts/image_to_video.py diff --git a/README.md b/README.md index a400e81c..4c80a0b0 100644 --- a/README.md +++ b/README.md @@ -31,6 +31,7 @@ Each plugin lives in `plugins/`. The directory name is the install keyword | `mac-notify` | macOS notifications when a Cline run completes. | | `nanobanana` | Image generation through OpenRouter and Gemini image models. | | `speak` | Speaks completed Cline replies with ElevenLabs text to speech. | +| `togetherai` | Together AI workflow skills for inference, training, media, evaluation, and infrastructure. | | `typescript-lsp` | TypeScript language service `goto_definition` support. | | `weather-metrics` | Demo weather tool plus runtime metrics hooks. | | `web-search` | Exa-backed web search as a Cline tool. 
| diff --git a/plugins/togetherai/LICENSE.togetherai-skills b/plugins/togetherai/LICENSE.togetherai-skills new file mode 100644 index 00000000..5f4a6d28 --- /dev/null +++ b/plugins/togetherai/LICENSE.togetherai-skills @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2025 Together AI + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/plugins/togetherai/README.md b/plugins/togetherai/README.md new file mode 100644 index 00000000..bb2d3167 --- /dev/null +++ b/plugins/togetherai/README.md @@ -0,0 +1,37 @@ +# Together AI + +Together AI workflow skills for Cline. + +## What It Does + +This plugin bundles Together AI skills for chat completions, batch inference, embeddings, evaluations, fine-tuning, images, video, audio, sandboxes, dedicated endpoints, dedicated containers, and GPU clusters. + +Each skill includes workflow guidance plus local reference files and example Python or TypeScript scripts. 
The plugin does not register an MCP server and does not run Together AI calls during install. + +The plugin also adds a Together AI safety rule so Cline asks before running scripts, installing SDKs, spending credits, uploading data, creating or deleting endpoints, launching clusters, or using remote execution. + +## Install + +```bash +cline plugin install togetherai +``` + +For local development from this repository: + +```bash +cline plugin install ./plugins/togetherai --cwd . +``` + +## Requirements + +- `TOGETHER_API_KEY` in the environment before running Together AI API examples. +- Python examples generally expect `together>=2.0.0`; TypeScript examples expect `together-ai`. +- Some workflows may also require external provider keys, Docker/container tooling, Kubernetes/Slurm access, Jig, Sprocket, or Together Cloud cluster permissions. + +## Security Notes + +Together AI workflows can spend credits, upload datasets or models, generate media, execute remote code, and provision billable infrastructure. Review scripts and target resources before running them, keep API keys out of source control, and clean up endpoints, clusters, sandboxes, storage, and generated artifacts when they are no longer needed. + +## Attribution + +The bundled Together AI skill material is MIT licensed. See `LICENSE.togetherai-skills`. 
diff --git a/plugins/togetherai/index.ts b/plugins/togetherai/index.ts new file mode 100644 index 00000000..92e348bf --- /dev/null +++ b/plugins/togetherai/index.ts @@ -0,0 +1,24 @@ +import type { AgentPlugin } from "@cline/sdk" + +const togetherAiRule = [ + "Together AI skills can create paid API calls, generated media, remote code runs, fine-tuning jobs, dedicated endpoints, containers, GPU clusters, and storage resources.", + "Do not run bundled scripts, install SDKs, submit jobs, create/delete infrastructure, upload training data or models, or spend API credits without explicit user approval.", + "Treat TOGETHER_API_KEY, external provider tokens, datasets, prompts, generated media URLs, model outputs, cluster credentials, and evaluation results as sensitive unless the user says otherwise.", + "Prefer read-only planning and local validation first. For destructive or cost-bearing workflows, state the target resource, expected cost/risk, and rollback or cleanup plan before proceeding.", +].join("\n") + +const plugin: AgentPlugin = { + name: "togetherai", + manifest: { + capabilities: ["skills", "rules"], + }, + setup(api) { + api.registerRule({ + id: "togetherai-safety", + source: "togetherai", + content: togetherAiRule, + }) + }, +} + +export default plugin diff --git a/plugins/togetherai/package.json b/plugins/togetherai/package.json new file mode 100644 index 00000000..1870a584 --- /dev/null +++ b/plugins/togetherai/package.json @@ -0,0 +1,20 @@ +{ + "name": "togetherai", + "version": "0.0.0", + "private": true, + "type": "module", + "description": "Cline plugin that bundles Together AI workflow skills.", + "cline": { + "plugins": [ + { + "paths": [ + "./index.ts" + ], + "capabilities": [ + "skills", + "rules" + ] + } + ] + } +} diff --git a/plugins/togetherai/skills/together-audio/SKILL.md b/plugins/togetherai/skills/together-audio/SKILL.md new file mode 100644 index 00000000..afad051b --- /dev/null +++ b/plugins/togetherai/skills/together-audio/SKILL.md @@ 
-0,0 +1,84 @@ +--- +name: together-audio +description: "Text-to-speech and speech-to-text via Together AI, including REST, streaming, and realtime WebSocket TTS, plus transcription, translation, diarization, timestamps, and live STT. Reach for it whenever the user needs audio in or audio out on Together AI rather than chat generation, image or video creation, or model training." +--- + +# Together Audio + +## Overview + +Use Together AI audio APIs for: + +- text-to-speech generation +- streaming or realtime voice output +- speech-to-text transcription +- translation, diarization, and timestamps +- live captioning and realtime transcription + +## When This Skill Wins + +- Generate spoken audio from text +- Transcribe uploaded audio files or URLs +- Add realtime voice or captioning to an app +- Extract speaker segments or word timings + +## Hand Off To Another Skill + +- Use `together-chat-completions` for text-only generation +- Use `together-video` or `together-images` for visual generation workflows +- Use `together-dedicated-endpoints` only when the audio model itself must be hosted on dedicated infrastructure + +## Quick Routing + +- REST TTS or streaming TTS + - Read [references/tts-models.md](references/tts-models.md) + - Start with [scripts/tts_generate.py](scripts/tts_generate.py) or [scripts/tts_generate.ts](scripts/tts_generate.ts) +- Realtime TTS over WebSocket + - Read [references/tts-models.md](references/tts-models.md) + - Start with [scripts/tts_websocket.py](scripts/tts_websocket.py) +- File transcription, translation, diarization, or timestamps + - Read [references/stt-models.md](references/stt-models.md) + - Start with [scripts/stt_transcribe.py](scripts/stt_transcribe.py) or [scripts/stt_transcribe.ts](scripts/stt_transcribe.ts) +- Realtime STT + - Read [references/stt-models.md](references/stt-models.md) + - Start with [scripts/stt_realtime.py](scripts/stt_realtime.py) + +## Workflow + +1. Confirm whether the task is TTS or STT. +2. 
Choose REST, streaming, or realtime transport based on latency and interaction needs. +3. Pick the model and response format from the relevant reference file. +4. Start from the matching script instead of rebuilding the request contract from memory. +5. For Python STT uploads, open audio files in binary mode and pass the file handle rather than a bare path string. + +## High-Signal Rules + +- Python scripts require the Together v2 SDK (`together>=2.0.0`). If the user is on an older version, they must upgrade first: `uv pip install --upgrade "together>=2.0.0"`. +- Use `client.audio.speech.create()` for TTS. +- REST TTS returns a `BinaryAPIResponse`; call `response.write_to_file(path)` to save it. Do NOT use `stream_to_file` (it does not exist on this object). +- Streaming TTS (`stream=True`) returns a `Stream` of `AudioSpeechStreamChunk` objects. Iterate chunks, check `chunk.type`, and decode `base64.b64decode(chunk.delta)` for audio data. There is no file-writing helper on the stream object. +- Use `client.audio.transcriptions.create()` for transcription and `client.audio.translations.create()` for translation. +- Batch transcription and translation share hard limits: 500 MB direct upload, 1 GB URL-fetch, 4 hours of audio per request. For larger payloads, pass a public HTTPS URL on `file=`; for longer audio, split into ≤ 4 h chunks. See the Limits section of [references/stt-models.md](references/stt-models.md). +- Realtime APIs require audio-format discipline; confirm PCM expectations before streaming bytes. +- Diarization and word timestamps change response shape; code for the richer verbose output explicitly. 
+ +## Resource Map + +- TTS reference: [references/tts-models.md](references/tts-models.md) +- STT reference: [references/stt-models.md](references/stt-models.md) +- Python TTS workflow: [scripts/tts_generate.py](scripts/tts_generate.py) +- TypeScript TTS workflow: [scripts/tts_generate.ts](scripts/tts_generate.ts) +- Python realtime TTS workflow: [scripts/tts_websocket.py](scripts/tts_websocket.py) +- Python STT workflow: [scripts/stt_transcribe.py](scripts/stt_transcribe.py) +- TypeScript STT workflow: [scripts/stt_transcribe.ts](scripts/stt_transcribe.ts) +- Python realtime STT workflow: [scripts/stt_realtime.py](scripts/stt_realtime.py) + +## Official Docs + +- [Text-to-Speech](https://docs.together.ai/docs/text-to-speech) +- [Speech-to-Text](https://docs.together.ai/docs/speech-to-text) +- [TTS REST API](https://docs.together.ai/reference/audio-speech) +- [TTS WebSocket API](https://docs.together.ai/reference/audio-speech-websocket) +- [Audio Transcriptions API](https://docs.together.ai/reference/audio-transcriptions) +- [Audio Translations API](https://docs.together.ai/reference/audio-translations) +- [Realtime Audio Transcriptions API](https://docs.together.ai/reference/audio-transcriptions-realtime) diff --git a/plugins/togetherai/skills/together-audio/references/stt-models.md b/plugins/togetherai/skills/together-audio/references/stt-models.md new file mode 100644 index 00000000..73402e4e --- /dev/null +++ b/plugins/togetherai/skills/together-audio/references/stt-models.md @@ -0,0 +1,302 @@ +# STT Models & Transcription Reference +## Contents + +- [Model Catalog](#model-catalog) +- [Supported Input Formats](#supported-input-formats) +- [Limits](#limits) +- [Audio Transcriptions](#audio-transcriptions) +- [Audio Translations](#audio-translations) +- [Realtime Transcription (WebSocket)](#realtime-transcription-websocket) +- [Response Formats](#response-formats) +- [Errors and Troubleshooting](#errors-and-troubleshooting) +- [Common Workflows](#common-workflows) +- 
[Async Support](#async-support) + + +## Model Catalog + +These models are current in the latest speech-to-text guide and are not listed in the current deprecation history. + +| Model | API String | Access | Capabilities | +|-------|-----------|--------|--------------| +| Whisper Large v3 | `openai/whisper-large-v3` | Serverless | Realtime, translation, diarization | +| Deepgram Flux | `deepgram/deepgram-flux` | Dedicated / Reserved | Realtime | +| Deepgram Nova 3 | `deepgram/deepgram-nova-3` | Dedicated / Reserved | Transcription | +| Deepgram Nova 3 Multilingual | `deepgram/deepgram-nova-3-multilingual` | Dedicated / Reserved | Transcription | +| Parakeet TDT 0.6B v3 | `nvidia/parakeet-tdt-0.6b-v3` | Serverless | Realtime, diarization | +| Nemotron 3 ASR Streaming 0.6B | `nvidia/nemotron-3-asr-streaming-0.6b` | Serverless | Streaming transcription | +| Nemotron 3.5 ASR Streaming 0.6B | `nvidia/nemotron-3.5-asr-streaming-0.6b` | Serverless | Streaming transcription | + +Notes: +- The `/audio/transcriptions` and `/audio/translations` reference schemas currently enumerate + `openai/whisper-large-v3` in the request body. +- The broader guide model catalog also includes Parakeet and dedicated-endpoint Deepgram models. + +## Supported Input Formats + +The guide and reference both list: +- `.wav` (`audio/wav`) +- `.mp3` (`audio/mpeg`) +- `.m4a` (`audio/mp4`) +- `.webm` (`audio/webm`) +- `.flac` (`audio/flac`) + +## Limits + +Both `/v1/audio/transcriptions` and `/v1/audio/translations` enforce the same caps. + +| Limit | Value | Notes | +|-------|-------|-------| +| Max file size (direct upload) | 500 MB | Requests above this are rejected at the edge with `HTTP 413 Payload Too Large`. | +| Max file size (URL fetch) | 1 GB | When you submit an HTTPS URL on the `file` field instead of binary, the server downloads up to 1 GB. Larger downloads fail with `400 file_too_large`. | +| Max audio duration | 4 hours per request | Longer audio is rejected with `400 audio_too_long`. 
Split into ≤ 4 h segments and submit separately. | + +Tips: +- For payloads larger than 500 MB, host the file at a public HTTPS URL and pass that URL as the `file` field - the 500 MB edge cap only applies to direct uploads. +- For audio longer than 4 hours, split into ≤ 4 h chunks before submitting. +- Real-time/streaming transports are unaffected by these batch upload limits. + +## Audio Transcriptions + +Use `/v1/audio/transcriptions` to transcribe speech into text in the same language as the source audio. + +### Parameters + +| Parameter | Type | Required | Description | +|-----------|------|----------|-------------| +| `file` | string / file / path / URL | Yes | Audio file upload or public HTTP/HTTPS URL | +| `model` | string | No | Defaults to `openai/whisper-large-v3` in the reference | +| `language` | string | No | ISO 639-1 code; `auto` enables auto-detection | +| `prompt` | string | No | Optional text to bias decoding | +| `response_format` | string | No | `json` or `verbose_json` | +| `temperature` | float | No | `0.0` to `1.0` | +| `timestamp_granularities` | string or array | No | `segment` and/or `word`; only used with `verbose_json` | +| `diarize` | bool | No | Enable speaker diarization | +| `min_speakers` | int | No | Minimum expected speakers | +| `max_speakers` | int | No | Maximum expected speakers | + +Language examples called out in the guide: +- `en` -- English +- `es` -- Spanish +- `fr` -- French +- `de` -- German +- `ja` -- Japanese +- `zh` -- Chinese +- `auto` -- auto-detect + +### Input Methods + +The guide shows all of these as valid Python inputs: + +```python +from pathlib import Path + +file="/path/to/audio.mp3" +file=Path("recordings/interview.wav") +file="https://example.com/audio.mp3" +file=open("audio.mp3", "rb") +``` + +## Audio Translations + +Use `/v1/audio/translations` to translate spoken audio. The guide frames this as translation to English. 
The current +reference also documents an optional `language` parameter whose default is `en`. + +### Parameters + +| Parameter | Type | Required | Description | +|-----------|------|----------|-------------| +| `file` | string / file / path / URL | Yes | Audio file upload or public HTTP/HTTPS URL | +| `model` | string | No | Defaults to `openai/whisper-large-v3` | +| `language` | string | No | Target output language; default `en` | +| `prompt` | string | No | Optional text to bias decoding | +| `response_format` | string | No | `json` or `verbose_json` | +| `temperature` | float | No | `0.0` to `1.0` | +| `timestamp_granularities` | string or array | No | `segment` and/or `word` for `verbose_json` | + +## Realtime Transcription (WebSocket) + +Use the realtime API when you need incremental transcription. + +Connection URL: + +```text +wss://api.together.ai/v1/realtime?model={model}&input_audio_format=pcm_s16le_16000 +``` + +Audio requirements: +- PCM signed 16-bit little-endian +- 16 kHz sample rate +- base64-encoded in `input_audio_buffer.append` + +### Authentication + +The reference documents Bearer auth. The speech-to-text guide also shows raw websocket examples using: +- `Authorization: Bearer YOUR_API_KEY` +- `OpenAI-Beta: realtime=v1` + +Some websocket client examples in the guide also use the `realtime`, `openai-insecure-api-key.*`, and +`openai-beta.realtime-v1` subprotocol pattern. 
+ +### Query Parameters + +| Parameter | Type | Required | Description | +|-----------|------|----------|-------------| +| `model` | string | Yes | Realtime STT model such as `openai/whisper-large-v3` | +| `input_audio_format` | string | Yes | Use `pcm_s16le_16000` | + +### Client Events + +```json +{"type": "input_audio_buffer.append", "audio": ""} +{"type": "input_audio_buffer.commit"} +``` + +### Server Events + +```json +{"type": "session.created", "session": {"model": "openai/whisper-large-v3"}} +{"type": "conversation.item.input_audio_transcription.delta", "delta": "The quick brown"} +{"type": "conversation.item.input_audio_transcription.completed", "transcript": "The quick brown fox jumps over the lazy dog"} +{"type": "conversation.item.input_audio_transcription.failed", "error": {"message": "Error description"}} +``` + +Delta semantics: +- `conversation.item.input_audio_transcription.delta` is an interim result +- each delta can replace the previous delta +- `conversation.item.input_audio_transcription.completed` is the finalized text chunk + +## Response Formats + +### JSON + +```json +{"text": "Hello, this is a test recording."} +``` + +### Verbose JSON + +`verbose_json` can include: +- `text` +- `language` +- `duration` +- `segments` +- `words` +- `speaker_segments` + +Segment example: + +```json +{"start": 0.11, "end": 10.85, "text": "..."} +``` + +Word example: + +```json +{"word": "Hello", "start": 0.00, "end": 0.36} +``` + +Speaker segment example: + +```json +{ + "speaker_id": "SPEAKER_01", + "start": 6.268, + "end": 30.776, + "text": "...", + "words": [{"word": "Hello", "start": 6.268, "end": 11.314, "speaker_id": "SPEAKER_01"}] +} +``` + +## Errors and Troubleshooting + +`/v1/audio/transcriptions` and `/v1/audio/translations` share the same code path and return the same error codes. + +| Response | Meaning | Recommended action | +|----------|---------|--------------------| +| `400 audio_too_long` | Audio duration exceeds the 4 hour cap. 
| Split the file into ≤ 4 h segments and submit separately. | +| `400 file_too_large` | A URL-fetched audio download exceeded the 1 GB server-side cap. | Compress the source, or split into smaller files. | +| `400 unsupported_format` | The audio container or codec could not be decoded. | Re-encode to a supported format. Run `ffprobe` on the file to confirm it is valid audio. | +| `400 invalid_params` | Request parameters failed validation. | Check the API reference for the endpoint. | +| `413 Payload Too Large` | A direct upload exceeded the 500 MB edge limit. | Submit the file via an HTTPS URL on the `file` field instead, or split the file. The 413 response is plain HTML, not JSON. | +| `429` | Rate limit exceeded. | See serverless rate limits. | +| `500 processing_failed` | Internal decode failure after the file was accepted. | Verify the file is valid audio with `ffprobe`. If it is, contact Together support with the response `id`. | + +## Common Workflows + +### Basic Transcription + +```python +from together import Together + +client = Together() +with open("meeting_recording.mp3", "rb") as audio_file: + response = client.audio.transcriptions.create( + file=audio_file, + model="openai/whisper-large-v3", + language="en", + response_format="json", + ) +print(response.text) +``` + +### Translation + +```python +with open("french_audio.mp3", "rb") as audio_file: + response = client.audio.translations.create( + file=audio_file, + model="openai/whisper-large-v3", + ) +print(response.text) +``` + +### Diarization + +```python +with open("meeting.mp3", "rb") as audio_file: + response = client.audio.transcriptions.create( + file=audio_file, + model="openai/whisper-large-v3", + response_format="verbose_json", + diarize=True, + min_speakers=1, + max_speakers=5, + ) +print(response.speaker_segments) +``` + +### Word-level Timestamps + +```python +with open("audio.mp3", "rb") as audio_file: + response = client.audio.transcriptions.create( + file=audio_file, + 
model="openai/whisper-large-v3", + response_format="verbose_json", + timestamp_granularities="word", + ) +for word in response.words: + print(f"{word.word}: {word.start:.2f}s-{word.end:.2f}s") +``` + +## Async Support + +```python +import asyncio +from together import AsyncTogether + + +async def transcribe_audio() -> str: + client = AsyncTogether() + with open("audio.mp3", "rb") as audio_file: + response = await client.audio.transcriptions.create( + file=audio_file, + model="openai/whisper-large-v3", + language="en", + ) + return response.text + + +print(asyncio.run(transcribe_audio())) +``` diff --git a/plugins/togetherai/skills/together-audio/references/tts-models.md b/plugins/togetherai/skills/together-audio/references/tts-models.md new file mode 100644 index 00000000..2589d02c --- /dev/null +++ b/plugins/togetherai/skills/together-audio/references/tts-models.md @@ -0,0 +1,227 @@ +# TTS Models & Voice Reference +## Contents + +- [Model Catalog](#model-catalog) +- [REST Parameters](#rest-parameters) +- [Streaming HTTP](#streaming-http) +- [Realtime WebSocket](#realtime-websocket) +- [Response Formats](#response-formats) +- [Voice Discovery](#voice-discovery) +- [Voice Lists](#voice-lists) + + +## Model Catalog + +These models are current in the latest text-to-speech guide and are not listed in the current deprecation history. 
+ +| Model | API String | Access | Endpoints | Pricing / Notes | +|-------|-----------|--------|-----------|-----------------| +| Orpheus 3B | `canopylabs/orpheus-3b-0.1-ft` | Serverless | REST, Streaming, WebSocket | $15 per 1M characters | +| Kokoro | `hexgrad/Kokoro-82M` | Serverless | REST, Streaming, WebSocket | $4 per 1M characters | +| Cartesia Sonic 3 | `cartesia/sonic-3` | Serverless / Dedicated / Reserved | REST | Build Tier 2+ | +| Cartesia Sonic 2 | `cartesia/sonic-2` | Serverless / Dedicated / Reserved | REST | $65 per 1M characters, Build Tier 2+ | +| Cartesia Sonic | `cartesia/sonic` | Serverless | REST | Listed in `/audio/speech` reference enum | +| Deepgram Aura 2 | `deepgram/deepgram-aura-2` | Dedicated / Reserved | REST, Streaming, WebSocket | Dedicated only | +| Rime Arcana v3 Turbo | `rime-labs/rime-arcana-v3-turbo` | Dedicated / Reserved | REST, Streaming, WebSocket | Dedicated only | +| Rime Arcana v3 | `rime-labs/rime-arcana-v3` | Dedicated / Reserved | REST, Streaming, WebSocket | Dedicated only | +| Rime Arcana v2 | `rime-labs/rime-arcana-v2` | Dedicated / Reserved | REST, Streaming, WebSocket | Dedicated only | +| Rime Mist v3 (Beta) | `rime-labs/rime-mist-v3` | Dedicated / Reserved | REST, Streaming, WebSocket | Dedicated only | +| Rime Mist v2 | `rime-labs/rime-mist-v2` | Dedicated / Reserved | REST, Streaming, WebSocket | Dedicated only | +| MiniMax Speech 2.8 Turbo | `minimax/speech-2.8-turbo` | Dedicated / Reserved | REST, Streaming, WebSocket | Dedicated only | +| MiniMax Speech 2.6 Turbo | `minimax/speech-2.6-turbo` | Dedicated / Reserved | REST, Streaming, WebSocket | Dedicated only | + +Notes: +- Orpheus and Kokoro support realtime WebSocket streaming for the lowest-latency TTS flows. +- Cartesia Sonic 2 and Sonic 3 require Build Tier 2 or higher on serverless and are also available on Dedicated and + Reserved Endpoints. 
+- The `/audio/speech` API reference currently enumerates `cartesia/sonic`, `hexgrad/Kokoro-82M`, and + `canopylabs/orpheus-3b-0.1-ft` in the request schema. + +## REST Parameters + +Use `/v1/audio/speech` for standard HTTP generation. + +| Parameter | Type | Required | Description | +|-----------|------|----------|-------------| +| `model` | string | Yes | TTS model identifier | +| `input` | string | Yes | Text to synthesize | +| `voice` | string | Yes | Voice ID | +| `response_format` | string | No | `mp3`, `wav`, `raw`, `mulaw`; MiniMax also supports `opus`, `aac`, `flac` in the guide | +| `sample_rate` | int | No | Output sample rate in Hz | +| `language` | string | No | Input language code such as `en`, `fr`, `es` | +| `alignment` | string | No | `none` or `word` | +| `segment` | string | No | `sentence`, `immediate`, or `never` | +| `response_encoding` | string | No | `pcm_f32le`, `pcm_s16le`, `pcm_mulaw`, `pcm_alaw` | +| `stream` | bool | No | Stream as SSE instead of waiting for the full file | + +Current behavior from the docs and reference: +- `response_format` defaults to `wav` +- when `stream=true`, the only supported HTTP `response_format` is `raw` +- `alignment=word` is only supported on streaming requests +- `/audio/speech` documents default sample rates of `24000` for Orpheus/Kokoro and `44100` for `cartesia/sonic` + +SDK response types (Python v2): +- Non-streaming: `client.audio.speech.create()` returns a `BinaryAPIResponse`. Save with `response.write_to_file(path)`. +- Streaming (`stream=True`): returns a `Stream[AudioSpeechStreamChunk]`. Iterate chunks, check `chunk.type`, and use `base64.b64decode(chunk.delta)` for `conversation.item.audio_output.delta` events. +- The SDK `create()` method accepts: `model`, `input`, `voice`, `language`, `response_encoding`, `response_format`, `sample_rate`, `stream`. Pass `alignment` and `segment` via `extra_body={"alignment": ..., "segment": ...}`. 
+ +## Streaming HTTP + +When `stream=true`, the HTTP endpoint returns server-sent events. + +Event examples: + +```json +data: {"type":"conversation.item.audio_output.delta","item_id":"tts_1","delta":""} +``` + +```json +data: {"type":"conversation.item.word_timestamps","words":["Hello","world"],"start_seconds":[0.0,0.4],"end_seconds":[0.4,0.8]} +``` + +```text +data: [DONE] +``` + +Use streaming HTTP when: +- you want lower time-to-first-byte without moving to raw WebSockets +- you want `conversation.item.word_timestamps` via `alignment=word` +- you are comfortable consuming SSE and decoding audio client-side + +## Realtime WebSocket + +Connection: + +```text +wss://api.together.ai/v1/audio/speech/websocket +``` + +Authentication: +- `Authorization: Bearer YOUR_API_KEY` +- or `?api_key=YOUR_API_KEY` + +The guide and reference document these query parameters: + +| Parameter | Type | Description | +|-----------|------|-------------| +| `model` | string | TTS model identifier | +| `voice` | string | Voice ID | +| `response_format` | string | `mp3`, `opus`, `aac`, `flac`, `wav`, `pcm` | +| `speed` | float | Playback speed, default `1.0` | +| `max_partial_length` | int | Character threshold before forced generation | +| `sample_rate` | int | Output sample rate in Hz | +| `language` | string | Language code such as `en`, `fr`, `es` | +| `alignment` | string | `none` or `word` | +| `segment` | string | `sentence`, `immediate`, or `never` | + +You can pass these in the URL or update them later with `tts_session.updated`. 
+ +### Client Events + +- `input_text_buffer.append` -- append text to the server buffer +- `input_text_buffer.commit` -- force synthesis of buffered text +- `input_text_buffer.clear` -- clear pending buffered text +- `tts_session.updated` -- update voice or other session options + +### Server Events + +- `session.created` -- initial session metadata +- `conversation.item.input_text.received` -- text acknowledged +- `conversation.item.audio_output.delta` -- base64-encoded audio chunks +- `conversation.item.audio_output.done` -- synthesis finished for an item +- `conversation.item.word_timestamps` -- emitted when `alignment=word` +- `conversation.item.tts.failed` -- error payload + +### Audio Format + +The reference documents realtime audio deltas as: +- base64-encoded chunks +- WAV / PCM s16le +- 24 kHz default sample rate in the documented examples + +If you request `response_format=pcm`, the payload is convenient to save directly as a `.pcm` file. + +## Response Formats + +| Format | Extension | Description | Notes | +|--------|-----------|-------------|-------| +| `wav` | `.wav` | Uncompressed audio | Standard file output | +| `mp3` | `.mp3` | Compressed audio | Smaller files | +| `raw` | `.pcm` | Raw PCM bytes | Required for HTTP streaming | +| `mulaw` | `.ulaw` | Telephony-friendly μ-law | Useful for phone pipelines | +| `pcm` | `.pcm` | Realtime WebSocket PCM output | WebSocket query parameter | +| `opus` | `.opus` | Compressed audio | WebSocket / MiniMax guide coverage | +| `aac` | `.aac` | Compressed audio | WebSocket / MiniMax guide coverage | +| `flac` | `.flac` | Lossless compressed audio | WebSocket / MiniMax guide coverage | + +## Voice Discovery + +List voices programmatically: + +```python +from together import Together + +client = Together() +response = client.audio.voices.list() +for model_voices in response.data: + print(f"Model: {model_voices.model}") + for voice in model_voices.voices: + print(f" - {voice.name}") +``` + +Or via cURL: + +```bash 
+curl -X GET "https://api.together.ai/v1/voices?model=canopylabs/orpheus-3b-0.1-ft" \ + -H "Authorization: Bearer $TOGETHER_API_KEY" +``` + +## Voice Lists + +### Orpheus + +`tara`, `leah`, `jess`, `leo`, `dan`, `mia`, `zac`, `zoe` + +### Kokoro + +`af_heart`, `af_alloy`, `af_aoede`, `af_bella`, `af_jessica`, `af_kore`, `af_nicole`, `af_nova`, +`af_river`, `af_sarah`, `af_sky`, `am_adam`, `am_echo`, `am_eric`, `am_fenrir`, `am_liam`, +`am_michael`, `am_onyx`, `am_puck`, `am_santa`, `bf_alice`, `bf_emma`, `bf_isabella`, `bf_lily`, +`bm_daniel`, `bm_fable`, `bm_george`, `bm_lewis`, `jf_alpha`, `jf_gongitsune`, `jf_nezumi`, +`jf_tebukuro`, `jm_kumo`, `zf_xiaobei`, `zf_xiaoni`, `zf_xiaoxiao`, `zf_xiaoyi`, `zm_yunjian`, +`zm_yunxi`, `zm_yunxia`, `zm_yunyang`, `ef_dora`, `em_alex`, `em_santa`, `ff_siwis`, `hf_alpha`, +`hf_beta`, `hm_omega`, `hm_psi`, `if_sara`, `im_nicola`, `pf_dora`, `pm_alex`, `pm_santa` + +### Cartesia + +The guide lists a large shared Cartesia voice catalog, including: +`friendly sidekick`, `reading lady`, `newsman`, `child`, `meditation lady`, `maria`, `calm lady`, +`helpful woman`, `reading man`, `new york man`, `barbershop man`, `customer support man`, `sarah`, +`laidback woman`, `reflective woman`, `professional woman`, `california girl`, `john`, `anna` + +Regional examples include: +`german conversational woman`, `french conversational lady`, `indian lady`, `british reading lady`, +`japanese children book`, `korean narrator woman`, `russian calm lady`, `chinese female conversational`, +`spanish narrator man`, `dutch confident man`, `hindi reporter man`, `italian calm man`, +`swedish narrator man`, `polish confident man` + +### Rime Mist v2 / v3 + +`cove`, `lagoon`, `mari`, `moon`, `moraine`, `peak`, `summit`, `talon`, `thunder`, `tundra`, +`wildflower` + +### Rime Arcana v2 / v3 / v3 Turbo + +`albion`, `arcade`, `astra`, `atrium`, `bond`, `cupola`, `eliphas`, `estelle`, `eucalyptus`, `fern`, +`lintel`, `luna`, `lyra`, `marlu`, `masonry`, `moss`, 
def parse_args() -> argparse.Namespace:
    """Parse the command line of the realtime speech-to-text example.

    Returns:
        A namespace with ``audio_file`` (required positional), ``model``
        and ``chunk_size`` (bytes of audio per WebSocket append event).
    """
    cli = argparse.ArgumentParser(description="Together AI realtime STT example")
    cli.add_argument("audio_file", help="Input WAV or raw PCM file")

    # Optional flags, in help-output order: (flag, argparse keyword options).
    optional_flags = (
        (
            "--model",
            {"default": "openai/whisper-large-v3", "help": "Realtime STT model"},
        ),
        (
            "--chunk-size",
            {"type": int, "default": 4096, "help": "Bytes per websocket append event"},
        ),
    )
    for flag, options in optional_flags:
        cli.add_argument(flag, **options)

    return cli.parse_args()
async def stream_audio(args: argparse.Namespace) -> None:
    """Stream a PCM file to the realtime STT WebSocket and print transcripts.

    The audio is sent as base64 ``input_audio_buffer.append`` events of
    ``args.chunk_size`` bytes, followed by one ``input_audio_buffer.commit``.
    A background task prints interim deltas and finalized transcripts while
    the upload is still running, and the finalized pieces are printed again
    as one combined transcript at the end.

    Args:
        args: Parsed CLI namespace providing ``audio_file``, ``model`` and
            ``chunk_size``.

    Raises:
        KeyError: If ``TOGETHER_API_KEY`` is not set in the environment.
        ValueError: If ``args.chunk_size`` is not positive, or the WAV file
            is rejected by ``load_pcm_s16le_16k``.
        RuntimeError: If the server sends a failed-transcription or ``error``
            event; the message is the JSON-encoded event.
    """
    if args.chunk_size <= 0:
        # range() would raise on 0 and silently send nothing on a negative step.
        raise ValueError("--chunk-size must be a positive number of bytes.")

    api_key = os.environ["TOGETHER_API_KEY"]
    audio_path = Path(args.audio_file)
    pcm_audio = load_pcm_s16le_16k(audio_path)
    url = f"wss://api.together.ai/v1/realtime?model={args.model}&input_audio_format=pcm_s16le_16000"
    headers = {
        "Authorization": f"Bearer {api_key}",
        "OpenAI-Beta": "realtime=v1",
    }
    # Receiver poll interval. After the commit, one silent interval ends the session.
    idle_timeout = 1.0

    async with websockets.connect(url, additional_headers=headers) as ws:
        commit_sent = asyncio.Event()
        completed_transcripts: list[str] = []

        async def receive_events() -> None:
            while True:
                try:
                    # Always poll with a timeout: an untimed recv() that started
                    # before the commit would never re-check `commit_sent` and
                    # would hang if the server stayed silent afterwards.
                    message = await asyncio.wait_for(ws.recv(), timeout=idle_timeout)
                except asyncio.TimeoutError:
                    if commit_sent.is_set():
                        return
                    continue
                except websockets.ConnectionClosedOK:
                    # A normal close is the end of the session, not a crash.
                    if not commit_sent.is_set():
                        print("\nServer closed the connection before all audio was sent.")
                    return

                event = json.loads(message)
                event_type = event.get("type")

                if event_type == "session.created":
                    # The documented session.created example carries only a
                    # `model` field, so `id` must not be indexed unconditionally.
                    session = event.get("session") or {}
                    print(f"Session created: {session.get('id', 'unknown')}")
                elif event_type == "conversation.item.input_audio_transcription.delta":
                    print(f"\r{event['delta']}", end="", flush=True)
                elif event_type == "conversation.item.input_audio_transcription.completed":
                    transcript = str(event["transcript"]).strip()
                    if transcript:
                        completed_transcripts.append(transcript)
                        print(f"\nFinal: {transcript}")
                elif event_type in {"conversation.item.input_audio_transcription.failed", "error"}:
                    raise RuntimeError(json.dumps(event))

        receiver = asyncio.create_task(receive_events())

        for start in range(0, len(pcm_audio), args.chunk_size):
            if receiver.done():
                # The receiver already stopped (server error or closed socket);
                # stop uploading and let `await receiver` surface the reason.
                break
            chunk = pcm_audio[start : start + args.chunk_size]
            payload = {
                "type": "input_audio_buffer.append",
                "audio": base64.b64encode(chunk).decode("utf-8"),
            }
            await ws.send(json.dumps(payload))

        if not receiver.done():
            await ws.send(json.dumps({"type": "input_audio_buffer.commit"}))
        commit_sent.set()
        await receiver

    if completed_transcripts:
        print("\nCombined transcript:")
        print(" ".join(completed_transcripts))
def translate(
    audio_path: Path,
    model: str,
    target_language: str | None,
    prompt: str | None,
    temperature: float | None,
) -> str:
    """Translate an audio file, print the result and return the text.

    Optional settings are forwarded only when supplied: ``target_language``
    (sent as ``language``) and ``prompt`` when non-empty, ``temperature``
    whenever it is not ``None`` (so ``0.0`` is still sent). With no target
    language the request omits ``language``, defaulting to English.
    """
    # Empty strings count as "not supplied", exactly like None.
    optional = {
        "language": target_language or None,
        "prompt": prompt or None,
        "temperature": temperature,
    }
    request: dict[str, object] = {"model": model}
    request.update({key: value for key, value in optional.items() if value is not None})

    with open(audio_path, "rb") as stream:
        result = client.audio.translations.create(file=stream, **request)

    print(f"Translation: {result.text}")
    return result.text
def parse_args() -> argparse.Namespace:
    """Parse the command line of the speech-to-text examples script.

    Returns:
        A namespace with the positional ``mode`` and ``audio_file`` plus the
        optional ``model``, ``language``, ``target_language``, ``prompt``,
        ``temperature``, ``granularity``, ``min_speakers`` and
        ``max_speakers`` settings.
    """
    cli = argparse.ArgumentParser(description="Together AI STT examples")
    cli.add_argument(
        "mode",
        choices=("transcribe", "translate", "diarize", "timestamps"),
        help="Workflow to run",
    )
    cli.add_argument("audio_file", help="Path to the input audio file")

    # Optional flags, in help-output order: (flag, argparse keyword options).
    optional_flags = (
        ("--model", {"default": "openai/whisper-large-v3", "help": "Speech-to-text model"}),
        (
            "--language",
            {
                "default": None,
                "help": "Source language for transcription or target language for translation when appropriate",
            },
        ),
        ("--target-language", {"default": None, "help": "Optional target language for translations"}),
        ("--prompt", {"default": None, "help": "Optional prompt to bias decoding"}),
        (
            "--temperature",
            {
                "type": float,
                "default": None,
                "help": "Optional sampling temperature between 0.0 and 1.0",
            },
        ),
        (
            "--granularity",
            {"choices": ("segment", "word"), "default": "word", "help": "Timestamp granularity"},
        ),
        (
            "--min-speakers",
            {"type": int, "default": 1, "help": "Minimum expected speakers for diarization"},
        ),
        (
            "--max-speakers",
            {"type": int, "default": 5, "help": "Maximum expected speakers for diarization"},
        ),
    )
    for flag, options in optional_flags:
        cli.add_argument(flag, **options)

    return cli.parse_args()
) + return + + timestamps( + audio_path=audio_path, + model=args.model, + granularity=args.granularity, + ) + + +if __name__ == "__main__": + main() diff --git a/plugins/togetherai/skills/together-audio/scripts/stt_transcribe.ts b/plugins/togetherai/skills/together-audio/scripts/stt_transcribe.ts new file mode 100644 index 00000000..1188003e --- /dev/null +++ b/plugins/togetherai/skills/together-audio/scripts/stt_transcribe.ts @@ -0,0 +1,194 @@ +#!/usr/bin/env -S npx tsx +/** + * Together AI speech-to-text examples with the TypeScript SDK. + * + * Demonstrates: + * - transcription + * - translation + * - diarization + * - timestamps + * + * Usage: + * npx tsx stt_transcribe.ts transcribe audio.mp3 + * npx tsx stt_transcribe.ts translate foreign_audio.mp3 --target-language en + * npx tsx stt_transcribe.ts diarize meeting.mp3 --min-speakers 2 --max-speakers 5 + * npx tsx stt_transcribe.ts timestamps audio.mp3 --granularity word + * + * Requirements: + * npm install together-ai + * export TOGETHER_API_KEY=your_key + */ + +import Together from "together-ai"; +import { readFileSync } from "fs"; + +const client = new Together(); + +type Mode = "transcribe" | "translate" | "diarize" | "timestamps"; + +interface ParsedArgs { + mode: Mode; + audioFile: string; + model: string; + language?: string; + targetLanguage?: string; + prompt?: string; + temperature?: number; + granularity: "segment" | "word"; + minSpeakers: number; + maxSpeakers: number; +} + +function parseArgs(): ParsedArgs { + const raw = process.argv.slice(2); + const get = (flag: string): string | undefined => { + const index = raw.indexOf(flag); + if (index === -1) return undefined; + return raw[index + 1]; + }; + + const mode = raw[0] as Mode | undefined; + const audioFile = raw[1]; + + if (!mode || !audioFile) { + console.error( + "Usage: npx tsx stt_transcribe.ts [options]", + ); + process.exit(1); + } + + if (!["transcribe", "translate", "diarize", "timestamps"].includes(mode)) { + console.error("Mode must 
/**
 * Read an audio file from disk and wrap it in a `File` for upload.
 *
 * The MIME type is picked from the lower-cased file extension; an unknown
 * or missing extension falls back to "audio/wav". The `File` name is the
 * path exactly as it was passed in.
 */
function loadAudioFile(filePath: string): File {
  // `Record` needs both type arguments; a bare `Record` does not compile.
  const mimeMap: Record<string, string> = {
    wav: "audio/wav",
    mp3: "audio/mpeg",
    m4a: "audio/mp4",
    webm: "audio/webm",
    flac: "audio/flac",
  };
  const buffer = readFileSync(filePath);
  const ext = filePath.split(".").pop()?.toLowerCase() ?? "wav";
  return new File([buffer], filePath, { type: mimeMap[ext] ?? "audio/wav" });
}
/**
 * Parse the CLI arguments and run the workflow selected by `mode`.
 *
 * Any mode other than "transcribe", "translate" or "diarize" runs the
 * timestamps workflow.
 */
async function main(): Promise<void> {
  // `Promise` needs its type argument; a bare `Promise` return type does not compile.
  const args = parseArgs();

  switch (args.mode) {
    case "transcribe":
      await transcribe(args);
      break;
    case "translate":
      await translate(args);
      break;
    case "diarize":
      await diarize(args);
      break;
    default:
      await timestamps(args);
  }
}
+main().catch((error) => { + console.error(error); + process.exit(1); +}); diff --git a/plugins/togetherai/skills/together-audio/scripts/tts_generate.py b/plugins/togetherai/skills/together-audio/scripts/tts_generate.py new file mode 100644 index 00000000..f5e97b1b --- /dev/null +++ b/plugins/togetherai/skills/together-audio/scripts/tts_generate.py @@ -0,0 +1,267 @@ +#!/usr/bin/env python3 +""" +Together AI text-to-speech examples with the Python v2 SDK. + +Demonstrates: +- REST file generation +- Streaming HTTP generation +- Raw PCM byte output +- Voice discovery + +Usage: + python tts_generate.py --mode rest --text "Hello world" --output speech.mp3 + python tts_generate.py --mode stream --text "Hello world" --output speech_stream.wav + python tts_generate.py --mode raw --text "Hello world" --output speech_raw.pcm + python tts_generate.py --mode voices + +Requirements: + uv pip install "together>=2.0.0" + export TOGETHER_API_KEY=your_key +""" + +from __future__ import annotations + +import argparse +import base64 +import struct +from pathlib import Path + +from together import Together +from together import Omit + +client = Together() + + +def _write_wav(pcm_data: bytes, output_file: Path, sample_rate: int = 24000) -> None: + """Wrap raw PCM s16le bytes in a WAV header and write to file.""" + num_channels = 1 + bits_per_sample = 16 + byte_rate = sample_rate * num_channels * bits_per_sample // 8 + block_align = num_channels * bits_per_sample // 8 + data_size = len(pcm_data) + + with open(output_file, "wb") as f: + f.write(b"RIFF") + f.write(struct.pack(" None: + """Generate a complete audio file over HTTP.""" + payload: dict[str, object] = { + "model": model, + "input": text, + "voice": voice, + "response_format": response_format, + } + if language: + payload["language"] = language + if sample_rate is not None: + payload["sample_rate"] = sample_rate + + response = client.audio.speech.create(**payload) + response.write_to_file(str(output_file)) + print(f"Saved 
# WAVE format tag and bits per sample for the non-PCM16 `response_encoding` values.
_WAV_FORMAT_TAGS = {
    "pcm_f32le": (0x0003, 32),  # WAVE_FORMAT_IEEE_FLOAT
    "pcm_alaw": (0x0006, 8),  # WAVE_FORMAT_ALAW
    "pcm_mulaw": (0x0007, 8),  # WAVE_FORMAT_MULAW
}


def _encode_wav(audio: bytes, response_encoding: str, sample_rate: int) -> bytes:
    """Return a complete mono WAV file for audio in a non-PCM16 encoding."""
    format_tag, bits_per_sample = _WAV_FORMAT_TAGS[response_encoding]
    block_align = bits_per_sample // 8  # mono: one sample per frame
    # Non-PCM formats use the 18-byte fmt body (trailing cbSize = 0) plus a
    # `fact` chunk holding the frame count.
    fmt_body = struct.pack(
        "<HHIIHHH",
        format_tag,
        1,
        sample_rate,
        sample_rate * block_align,
        block_align,
        bits_per_sample,
        0,
    )
    # RIFF chunks are word-aligned: an odd-sized data chunk gets one pad byte.
    padding = b"\x00" if len(audio) % 2 else b""
    chunks = (
        struct.pack("<4sI", b"fmt ", len(fmt_body))
        + fmt_body
        + struct.pack("<4sII", b"fact", 4, len(audio) // block_align)
        + struct.pack("<4sI", b"data", len(audio))
        + audio
        + padding
    )
    return struct.pack("<4sI4s", b"RIFF", 4 + len(chunks), b"WAVE") + chunks


def generate_stream(
    text: str,
    output_file: Path,
    model: str,
    voice: str,
    response_encoding: str,
    language: str | None,
    sample_rate: int | None,
    alignment: str,
    segment: str,
) -> None:
    """Generate streaming audio and save it as a WAV file.

    The request is made with ``stream=True`` and ``response_format="raw"``.
    Each ``conversation.item.audio_output.delta`` chunk is base64-decoded and
    the pieces are concatenated; word-timestamp events are printed as they
    arrive. The WAV header matches ``response_encoding``, so float32, A-law
    and mu-law streams are no longer labeled as 16-bit PCM.

    Args:
        text: Text to synthesize.
        output_file: Destination path of the WAV file.
        model: TTS model identifier.
        voice: Voice identifier.
        response_encoding: One of ``pcm_s16le``, ``pcm_f32le``, ``pcm_mulaw``
            or ``pcm_alaw``.
        language: Optional language code; omitted from the request when falsy.
        sample_rate: Optional sample rate in Hz; when ``None`` it is omitted
            from the request and 24000 is written to the WAV header.
        alignment: Alignment mode, sent through ``extra_body``.
        segment: Segmentation mode, sent through ``extra_body``.

    Raises:
        ValueError: If ``response_encoding`` cannot be described by a WAV header.
    """
    # Reject an unknown encoding before spending an API call on it.
    if response_encoding != "pcm_s16le" and response_encoding not in _WAV_FORMAT_TAGS:
        raise ValueError(f"Unsupported response encoding for WAV output: {response_encoding}")

    extra: dict[str, object] = {
        "alignment": alignment,
        "segment": segment,
    }
    response = client.audio.speech.create(
        model=model,
        input=text,
        voice=voice,
        stream=True,
        response_format="raw",
        response_encoding=response_encoding,
        language=language if language else Omit(),
        sample_rate=sample_rate if sample_rate is not None else Omit(),
        extra_body=extra,
    )
    raw_chunks: list[bytes] = []
    for chunk in response:
        if chunk.type == "conversation.item.audio_output.delta":
            raw_chunks.append(base64.b64decode(chunk.delta))
        elif chunk.type == "conversation.item.word_timestamps":
            print(f"Timestamps: {chunk.model_extra}")

    pcm_data = b"".join(raw_chunks)
    header_rate = sample_rate or 24000
    if response_encoding == "pcm_s16le":
        # 16-bit PCM keeps using the script's existing WAV writer.
        _write_wav(pcm_data, output_file, sample_rate=header_rate)
    else:
        output_file.write_bytes(_encode_wav(pcm_data, response_encoding, header_rate))
    print(f"Saved streaming audio to {output_file}")
def parse_args() -> argparse.Namespace:
    """Parse the command line of the text-to-speech examples script.

    When ``--output`` is not given, the default file name follows the data
    that will be written, instead of always being ``speech.mp3``:

    * ``stream`` mode writes a WAV file, so ``speech_stream.wav``
    * ``raw`` mode writes raw PCM bytes, so ``speech_raw.pcm``
    * otherwise ``speech.<response-format>`` (``speech.mp3`` by default)

    Returns:
        A namespace with ``mode``, ``text``, ``output`` (always a string),
        ``model``, ``voice``, ``response_format``, ``response_encoding``,
        ``language``, ``sample_rate``, ``alignment`` and ``segment``.
    """
    parser = argparse.ArgumentParser(description="Together AI TTS examples")
    parser.add_argument(
        "--mode",
        choices=("rest", "stream", "raw", "voices"),
        default="rest",
        help="Workflow to run",
    )
    parser.add_argument(
        "--text",
        default="Today is a wonderful day to build something people love!",
        help="Input text",
    )
    parser.add_argument(
        "--output",
        default=None,
        help=(
            "Output file path (default depends on --mode: speech.<response-format>, "
            "speech_stream.wav, or speech_raw.pcm)"
        ),
    )
    parser.add_argument(
        "--model",
        default="canopylabs/orpheus-3b-0.1-ft",
        help="TTS model",
    )
    parser.add_argument(
        "--voice",
        default="tara",
        help="Voice identifier",
    )
    parser.add_argument(
        "--response-format",
        choices=("mp3", "wav"),
        default="mp3",
        help="Output format for REST mode",
    )
    parser.add_argument(
        "--response-encoding",
        choices=("pcm_f32le", "pcm_s16le", "pcm_mulaw", "pcm_alaw"),
        default="pcm_s16le",
        help="Raw audio encoding for streaming or raw-byte modes",
    )
    parser.add_argument(
        "--language",
        default=None,
        help="Optional language code such as en, fr, or es",
    )
    parser.add_argument(
        "--sample-rate",
        type=int,
        default=None,
        help="Optional output sample rate in Hz",
    )
    parser.add_argument(
        "--alignment",
        choices=("none", "word"),
        default="none",
        help="Streaming alignment mode",
    )
    parser.add_argument(
        "--segment",
        choices=("sentence", "immediate", "never"),
        default="sentence",
        help="Streaming segmentation mode",
    )
    args = parser.parse_args()

    if args.output is None:
        # Match the default extension to what the chosen mode actually writes.
        mode_defaults = {"stream": "speech_stream.wav", "raw": "speech_raw.pcm"}
        args.output = mode_defaults.get(args.mode, f"speech.{args.response_format}")
    return args
/**
 * Parse and validate the command-line flags for the TTS examples.
 *
 * Flags are read as `--flag value` pairs from `process.argv`. Every flag is
 * optional and falls back to the same default as the Python sibling script.
 * Any invalid input (unknown enum value, flag without a value, non-numeric
 * sample rate) prints a usage message and exits with status 1 instead of
 * being forwarded to the API.
 *
 * @returns The fully populated, validated argument set.
 */
function parseArgs(): ParsedArgs {
  const raw = process.argv.slice(2);

  // Report a usage problem and stop. Typed `never` so callers can `return fail(...)`.
  const fail = (message: string): never => {
    console.error(message);
    process.exit(1);
  };

  // Value following `flag`, or undefined when the flag was not supplied.
  const get = (flag: string): string | undefined => {
    const index = raw.indexOf(flag);
    if (index === -1) return undefined;
    const value = raw[index + 1];
    // A flag at the end of argv, or directly followed by another flag, has no
    // value; without this check the next flag name would be used as the value.
    if (value === undefined || value.startsWith("--")) {
      return fail(`Expected a value after ${flag}`);
    }
    return value;
  };

  // Read an enum-like flag and reject anything outside `allowed`.
  const choice = <T extends string>(flag: string, allowed: readonly T[], fallback: T): T => {
    const value = get(flag) ?? fallback;
    if (!(allowed as readonly string[]).includes(value)) {
      return fail(`Expected ${flag} to be one of: ${allowed.join(", ")}`);
    }
    return value as T;
  };

  const mode = choice<Mode>("--mode", ["rest", "stream", "voices"], "rest");
  const responseFormat = choice<ParsedArgs["responseFormat"]>(
    "--response-format",
    ["mp3", "wav"],
    "mp3",
  );
  const responseEncoding = choice<ParsedArgs["responseEncoding"]>(
    "--response-encoding",
    ["pcm_f32le", "pcm_s16le", "pcm_mulaw", "pcm_alaw"],
    "pcm_s16le",
  );
  const alignment = choice<ParsedArgs["alignment"]>("--alignment", ["none", "word"], "none");
  const segment = choice<ParsedArgs["segment"]>(
    "--segment",
    ["sentence", "immediate", "never"],
    "sentence",
  );

  // Number("abc") is NaN, and `typeof NaN === "number"`, so an unchecked value
  // would slip through the later `typeof` guards and reach the request payload.
  const sampleRateText = get("--sample-rate");
  let sampleRate: number | undefined;
  if (sampleRateText !== undefined) {
    sampleRate = Number(sampleRateText);
    if (!Number.isInteger(sampleRate) || sampleRate <= 0) {
      return fail(`Expected --sample-rate to be a positive integer, got: ${sampleRateText}`);
    }
  }

  return {
    mode,
    text: get("--text") ?? "Today is a wonderful day to build something people love!",
    output: get("--output") ?? "speech.mp3",
    model: get("--model") ?? "canopylabs/orpheus-3b-0.1-ft",
    voice: get("--voice") ?? "tara",
    responseFormat,
    responseEncoding,
    language: get("--language"),
    sampleRate,
    alignment,
    segment,
  };
}
/**
 * Generate speech over the streaming HTTP endpoint and save the raw audio.
 *
 * Requests `response_format: "raw"` with `stream: true`, collects every
 * `conversation.item.audio_output.delta` event (base64 audio), prints
 * `conversation.item.word_timestamps` events as JSON, and finally writes the
 * concatenated audio bytes to `args.output`.
 *
 * @param args Parsed CLI options (model, text, voice, encoding, output path, ...).
 */
async function generateStream(args: ParsedArgs): Promise<void> {
  const payload: Record<string, unknown> = {
    model: args.model,
    input: args.text,
    voice: args.voice,
    stream: true,
    response_format: "raw",
    response_encoding: args.responseEncoding,
    alignment: args.alignment,
    segment: args.segment,
  };
  if (args.language) payload.language = args.language;
  // Number.isFinite also rejects NaN, which `typeof x === "number"` lets through.
  if (Number.isFinite(args.sampleRate)) payload.sample_rate = args.sampleRate;

  const response = await client.audio.speech.create(payload);
  const rawChunks: Buffer[] = [];

  const consume = (events: Array<Record<string, unknown>>): void => {
    for (const event of events) {
      if (event.type === "conversation.item.audio_output.delta" && typeof event.delta === "string") {
        rawChunks.push(Buffer.from(event.delta, "base64"));
      }
      if (event.type === "conversation.item.word_timestamps") {
        console.log(JSON.stringify(event));
      }
    }
  };

  // Transport chunks are not aligned to SSE lines: one `data:` line can be
  // split across two chunks. Parsing each chunk on its own would fail
  // JSON.parse on both halves and silently drop that audio, so text is
  // buffered and only complete lines are handed to the parser.
  const decoder = new TextDecoder("utf-8");
  let pending = "";

  for await (const chunk of response as AsyncIterable<unknown>) {
    const isText = typeof chunk === "string";
    // Buffer is a Uint8Array subclass, so this covers both byte types.
    const isBytes = chunk instanceof Uint8Array;

    if (!isText && !isBytes) {
      // Already-parsed event objects need no buffering.
      consume(parseStreamingPayloads(chunk));
      continue;
    }

    pending += isText
      ? (chunk as string)
      : decoder.decode(chunk as Uint8Array, { stream: true });

    const lastNewline = pending.lastIndexOf("\n");
    if (lastNewline === -1) continue;

    consume(parseStreamingPayloads(pending.slice(0, lastNewline + 1)));
    pending = pending.slice(lastNewline + 1);
  }

  // Flush the decoder and any final line that was not newline-terminated.
  pending += decoder.decode();
  if (pending) consume(parseStreamingPayloads(pending));

  await fs.writeFile(args.output, Buffer.concat(rawChunks));
  console.log(`Saved raw streaming audio to ${args.output}`);
}
async def synthesize(args: argparse.Namespace) -> None:
    """Stream text-to-speech audio over the realtime WebSocket and save it.

    Opens the realtime TTS socket with the options from ``args`` encoded in the
    query string, sends ``args.text`` as a single committed buffer, collects the
    base64 audio deltas, and writes the decoded bytes to ``args.output``.
    Word-timestamp events are printed as JSON when the server sends them.

    Args:
        args: Parsed CLI options (``text``, ``output``, ``model``, ``voice``,
            ``sample_rate``, ``response_format``, ``alignment``, ``segment``,
            ``max_partial_length``).

    Raises:
        KeyError: If ``TOGETHER_API_KEY`` is not set in the environment.
        RuntimeError: If the server reports an error (including as the very
            first frame), or the connection closes before any audio arrives.
    """
    api_key = os.environ["TOGETHER_API_KEY"]
    query = urlencode(
        {
            "model": args.model,
            "voice": args.voice,
            "response_format": args.response_format,
            "sample_rate": args.sample_rate,
            "alignment": args.alignment,
            "segment": args.segment,
            "max_partial_length": args.max_partial_length,
        }
    )
    url = f"wss://api.together.ai/v1/audio/speech/websocket?{query}"
    headers = {"Authorization": f"Bearer {api_key}"}

    failure_types = {"conversation.item.tts.failed", "error"}
    output = bytearray()
    finished = False

    async with websockets.connect(url, additional_headers=headers) as ws:
        session_message = json.loads(await ws.recv())
        # The first frame is expected to describe the session. If the server
        # sends an error instead (bad key, unknown model, ...), surface that
        # payload rather than a KeyError on the missing "session" field.
        session = session_message.get("session")
        if session_message.get("type") in failure_types or not isinstance(session, dict):
            raise RuntimeError(json.dumps(session_message))
        print(f"Session created: {session.get('id')}")

        await ws.send(json.dumps({"type": "input_text_buffer.append", "text": args.text}))
        await ws.send(json.dumps({"type": "input_text_buffer.commit"}))

        async for message in ws:
            event = json.loads(message)
            event_type = event.get("type")

            if event_type == "conversation.item.audio_output.delta":
                output.extend(base64.b64decode(event["delta"]))
            elif event_type == "conversation.item.word_timestamps":
                print(json.dumps(event, indent=2))
            elif event_type == "conversation.item.audio_output.done":
                finished = True
                break
            elif event_type in failure_types:
                raise RuntimeError(json.dumps(event))

    # The loop also ends when the socket closes without a "done" event; do not
    # report that as a clean success.
    if not finished:
        if not output:
            raise RuntimeError("Connection closed before any audio was received.")
        print("Warning: connection closed before audio_output.done; audio may be truncated.")

    output_path = Path(args.output)
    output_path.write_bytes(output)
    print(f"Saved realtime audio to {output_path}")

    if args.response_format == "pcm":
        print(f"Play with: ffplay -f s16le -ar {args.sample_rate} {output_path}")
+--- + +# Together Batch Inference + +## Overview + +Use Together AI's Batch API for large offline workloads where latency is not the primary concern. + +Typical fits: + +- bulk classification +- synthetic data generation +- dataset transformations +- large summarization or enrichment jobs +- low-cost asynchronous inference + +## When This Skill Wins + +- The user has many independent requests to run +- A JSONL request file is acceptable +- Turnaround time can be minutes or hours instead of seconds +- Lower cost matters more than immediate interactivity + +## Hand Off To Another Skill + +- Use `together-chat-completions` for real-time requests or tool-calling apps +- Use `together-evaluations` for managed LLM-as-a-judge workflows +- Use `together-embeddings` for retrieval-specific vector generation + +## Quick Routing + +- End-to-end batch workflow + - Start with [scripts/batch_workflow.py](scripts/batch_workflow.py) or [scripts/batch_workflow.ts](scripts/batch_workflow.ts) +- Request format, status model, and result downloads + - Read [references/api-reference.md](references/api-reference.md) +- Operational guidance and batch sizing + - Read [references/api-reference.md](references/api-reference.md) + +## Workflow + +1. Build a JSONL file where each line contains `custom_id` and `body`. +2. Upload the file with `purpose="batch-api"`. +3. Create the batch with `input_file_id=...` and the target endpoint. +4. Poll until the job is terminal. +5. Download output and error files, then reconcile by `custom_id`. + +## High-Signal Rules + +- Python scripts require the Together v2 SDK (`together>=2.0.0`). If the user is on an older version, they must upgrade first: `uv pip install --upgrade "together>=2.0.0"`. +- Use `input_file_id`, not legacy file parameters. +- Keep `custom_id` stable and meaningful so result reconciliation is easy. +- Batch is for independent requests. If the workload depends on shared conversation state, it is probably the wrong tool. 
+- Always inspect the error file in addition to the success output. +- `client.batches.create()` returns a wrapper; access the batch object via `response.job` (e.g., `response.job.id`). `client.batches.retrieve()` returns the batch object directly. +- For classification or labeling workloads, set `max_tokens` low (e.g., 4), use `temperature: 0`, and constrain the system prompt to return only the label. This minimizes output tokens and cost. +- Small batches (under 1K requests) typically complete in minutes. The 24-hour completion window is a maximum, not typical. + +## Resource Map + +- API reference and operational guidance: [references/api-reference.md](references/api-reference.md) +- Python workflow: [scripts/batch_workflow.py](scripts/batch_workflow.py) +- TypeScript workflow: [scripts/batch_workflow.ts](scripts/batch_workflow.ts) + +## Official Docs + +- [Batch Inference](https://docs.together.ai/docs/batch-inference) +- [Batch API](https://docs.together.ai/reference/batch-create) diff --git a/plugins/togetherai/skills/together-batch-inference/references/api-reference.md b/plugins/togetherai/skills/together-batch-inference/references/api-reference.md new file mode 100644 index 00000000..5b240c10 --- /dev/null +++ b/plugins/togetherai/skills/together-batch-inference/references/api-reference.md @@ -0,0 +1,311 @@ +# Batch Inference API Reference +## Contents + +- [Endpoints](#endpoints) +- [Input File Format (JSONL)](#input-file-format-jsonl) +- [Output File Format (JSONL)](#output-file-format-jsonl) +- [Create Batch Request](#create-batch-request) +- [Batch Job Object (Response)](#batch-job-object-response) +- [Batch Job Status](#batch-job-status) +- [Workflow](#workflow) +- [Models with 50% Discount](#models-with-50-discount) +- [Rate Limits](#rate-limits) +- [Error Handling](#error-handling) +- [Best Practices](#best-practices) +- [Error Codes](#error-codes) +- [CLI Commands](#cli-commands) + + +## Endpoints + +| Endpoint | Operation | Description | +|----------|-----------|-------------| +| 
`POST /batches` | Create batch | Submit a new batch job | +| `GET /batches` | List batches | List all batch jobs | +| `GET /batches/{id}` | Get batch | Get batch details | +| `POST /batches/{id}/cancel` | Cancel batch | Cancel a batch job | + +Base URL: `https://api.together.xyz/v1` +Authentication: `Authorization: Bearer $TOGETHER_API_KEY` + +## Input File Format (JSONL) + +Each line is a JSON object with two required fields: + +```json +{"custom_id": "request-1", "body": {"model": "Qwen/Qwen2.5-7B-Instruct-Turbo", "messages": [{"role": "user", "content": "Hello"}], "max_tokens": 200}} +``` + +- `custom_id` (string, required): Unique identifier for tracking (max 64 chars) +- `body` (object, required): Request matching the `/v1/chat/completions` schema + +## Output File Format (JSONL) + +Each line in the output file is a JSON object keyed by `custom_id`: + +```json +{"custom_id": "request-1", "response": {"status_code": 200, "body": {"id": "...", "object": "chat.completion", "model": "Qwen/Qwen2.5-7B-Instruct-Turbo", "choices": [{"index": 0, "message": {"role": "assistant", "content": "Hello!"}, "finish_reason": "stop"}], "usage": {"prompt_tokens": 12, "completion_tokens": 3, "total_tokens": 15}}}} +``` + +To extract the assistant's reply from a result line: + +```python +content = ( + result["response"]["body"]["choices"][0]["message"]["content"] +) +``` + +## Create Batch Request + +| Field | Type | Required | Description | +|-------|------|----------|-------------| +| `endpoint` | string | Yes | API endpoint (`/v1/chat/completions`) | +| `input_file_id` | string | Yes | ID of the uploaded input file | +| `completion_window` | string | No | Time window for completion (default: `24h`) | +| `priority` | integer | No | Priority for batch processing | +| `model_id` | string | No | Model to use for processing | + +## Batch Job Object (Response) + +| Field | Type | Description | +|-------|------|-------------| +| `id` | string (UUID) | Unique batch job identifier | +| 
`user_id` | string | Associated user ID | +| `input_file_id` | string | Input file reference | +| `file_size_bytes` | integer | Size of input file in bytes | +| `status` | enum | Job state (see Status table below) | +| `endpoint` | string | API endpoint used | +| `progress` | float | Completion percentage (0.0 to 100.0) | +| `model_id` | string | Model used for processing | +| `output_file_id` | string | Results file reference (available on COMPLETED) | +| `error_file_id` | string | Error file reference (available on failure) | +| `error` | string | Error message (if applicable) | +| `job_deadline` | datetime | Deadline for completion | +| `created_at` | datetime | Creation timestamp | +| `completed_at` | datetime | Completion timestamp | + +## Batch Job Status + +| Status | Description | +|--------|-------------| +| `VALIDATING` | Input file being validated | +| `IN_PROGRESS` | Processing requests | +| `COMPLETED` | All requests processed | +| `FAILED` | Processing failed | +| `EXPIRED` | Job exceeded time limit | +| `CANCELLED` | User cancelled | + +## Workflow + +### 1. Upload Input File + +Pass `check=False` to skip client-side file validation and let the server validate during the `VALIDATING` phase. + +```python +from together import Together + +client = Together() +file_resp = client.files.upload(file="batch_input.jsonl", purpose="batch-api", check=False) +print(file_resp.id) # file-abc123 +``` + +```typescript +import Together from "together-ai"; + +const client = new Together(); +const fileResp = await client.files.upload("batch_input.jsonl", "batch-api", false); +console.log(fileResp.id); +``` + +```shell +curl -X POST "https://api.together.xyz/v1/files" \ + -H "Authorization: Bearer $TOGETHER_API_KEY" \ + -F "purpose=batch-api" \ + -F "file=@batch_input.jsonl" +``` + +### 2. Create Batch + +Note: `create()` returns a wrapper object. 
Access the batch via `.job`: + +```python +response = client.batches.create( + input_file_id=file_resp.id, + endpoint="/v1/chat/completions", +) +batch = response.job +print(batch.id) # batch-abc123 +``` + +```typescript +const response = await client.batches.create({ + endpoint: "/v1/chat/completions", + input_file_id: fileResp.id, +}); +console.log(response.job?.id); +``` + +```shell +curl -X POST "https://api.together.xyz/v1/batches" \ + -H "Authorization: Bearer $TOGETHER_API_KEY" \ + -H "Content-Type: application/json" \ + -d '{"input_file_id": "file-abc123", "endpoint": "/v1/chat/completions"}' +``` + +### 3. Check Status + +Unlike `create()`, `retrieve()` returns the batch object directly (no `.job` wrapper): + +```python +batch = client.batches.retrieve(batch.id) +print(batch.status) # VALIDATING, IN_PROGRESS, COMPLETED, FAILED +print(batch.progress) # 0.0 to 100.0 +``` + +```typescript +const batchId = response.job?.id; +const batchInfo = await client.batches.retrieve(batchId); +console.log(batchInfo.status); +``` + +```shell +curl -X GET "https://api.together.xyz/v1/batches/batch-abc123" \ + -H "Authorization: Bearer $TOGETHER_API_KEY" +``` + +### 4. Download Results + +```python +if batch.status == "COMPLETED": + with client.files.with_streaming_response.content(id=batch.output_file_id) as response: + with open("batch_output.jsonl", "wb") as f: + for chunk in response.iter_bytes(): + f.write(chunk) +``` + +```typescript +const batchResult = await client.batches.retrieve(batchId); + +if (batchResult.status === "COMPLETED" && batchResult.output_file_id) { + const resp = await client.files.content(batchResult.output_file_id); + const result = await resp.text(); + console.log(result); +} +``` + +```shell +# First retrieve the batch to get output_file_id, then download: +curl -X GET "https://api.together.xyz/v1/files/file-output456/content" \ + -H "Authorization: Bearer $TOGETHER_API_KEY" \ + -o batch_output.jsonl +``` + +### 5. 
Cancel Batch + +```python +client.batches.cancel("batch-abc123") +``` + +```typescript +const cancelled = await client.batches.cancel("batch-abc123"); +console.log(cancelled); +``` + +```shell +curl -X POST "https://api.together.xyz/v1/batches/batch-abc123/cancel" \ + -H "Authorization: Bearer $TOGETHER_API_KEY" +``` + +### 6. List Batches + +```python +batches = client.batches.list() +for batch in batches: + print(batch) +``` + +```typescript +const allBatches = await client.batches.list(); +for (const batch of allBatches ?? []) { + console.log(batch); +} +``` + +```shell +curl -X GET "https://api.together.xyz/v1/batches" \ + -H "Authorization: Bearer $TOGETHER_API_KEY" +``` + +## Models with 50% Discount + +- `meta-llama/Llama-3.3-70B-Instruct-Turbo` + +Most serverless models support batch processing through the chat completions endpoint; models not listed above have no discount. The following serverless models are not currently available for batch and will fail if submitted: + +- `deepseek-ai/DeepSeek-R1` +- `deepseek-ai/DeepSeek-V3.1` +- `deepseek-ai/DeepSeek-V4-Pro` +- `MiniMaxAI/MiniMax-M2.7` +- `moonshotai/Kimi-K2.5` +- `moonshotai/Kimi-K2.6` +- `Qwen/Qwen3.5-397B-A17B` +- `zai-org/GLM-5` +- `zai-org/GLM-5.1` + +## Rate Limits + +| Limit | Value | +|-------|-------| +| Max requests per batch | 50,000 | +| Max file size | 100MB | +| Max tokens enqueued per model | 30B | +| Recommended batch size | 1,000-10,000 | + +Batch API rate limits are separate from standard per-model rate limits. 
+ +## Error Handling + +Per-request errors are recorded in a separate file accessible via `error_file_id`: + +```jsonl +{"custom_id": "req-1", "error": {"message": "Invalid model specified", "code": "invalid_model"}} +{"custom_id": "req-5", "error": {"message": "Request timeout", "code": "timeout"}} +``` + +## Best Practices + +- Aim for 1,000-10,000 requests per batch unless you have a strong reason to go smaller or larger +- Validate JSONL before submission to avoid wasting a full batch run on malformed input +- Use unique `custom_id` values so output and error rows can be reconciled deterministically +- Poll status every 30-60 seconds rather than tight-looping the API +- Small batches (under 1K requests) typically complete in minutes; most batches finish within 24 hours; allow up to 72 hours for very large or complex runs +- Reuse uploaded batch files across multiple jobs when the request set is unchanged + +## Error Codes + +| Code | Description | Solution | +|------|-------------|----------| +| 400 | Invalid request format | Check JSONL syntax and required fields | +| 401 | Authentication failed | Verify API key | +| 404 | Batch not found | Check batch ID | +| 429 | Rate limit exceeded | Reduce request frequency | +| 500 | Server error | Retry with exponential backoff | + +## CLI Commands + +```shell +# Upload file +together files upload batch_input.jsonl --purpose batch-api + +# Create batch +together batches create --input-file file-abc123 --endpoint /v1/chat/completions + +# Check status +together batches retrieve batch-abc123 + +# List batches +together batches list + +# Cancel +together batches cancel batch-abc123 +``` diff --git a/plugins/togetherai/skills/together-batch-inference/scripts/batch_workflow.py b/plugins/togetherai/skills/together-batch-inference/scripts/batch_workflow.py new file mode 100644 index 00000000..fd659149 --- /dev/null +++ b/plugins/togetherai/skills/together-batch-inference/scripts/batch_workflow.py @@ -0,0 +1,170 @@ 
def load_requests_from_jsonl(path: str) -> list[dict]:
    """Load and validate batch requests from a JSONL file.

    Blank lines are skipped. Every remaining line must be a JSON object with
    the two fields the Batch API requires, ``custom_id`` and ``body``, and
    ``custom_id`` values must be unique strings so that output and error rows
    can be reconciled. Problems are reported with the 1-based line number so a
    bad row is caught locally instead of after the upload.

    Args:
        path: Path to the JSONL request file (read as UTF-8).

    Returns:
        The parsed request objects, in file order.

    Raises:
        json.JSONDecodeError: If a line is not valid JSON.
        ValueError: If a line is not an object, lacks a required field, or
            repeats (or has a non-string) ``custom_id``.
    """
    requests: list[dict] = []
    seen_ids: set[str] = set()

    with open(path, encoding="utf-8") as handle:
        for line_number, line in enumerate(handle, start=1):
            if not line.strip():
                continue

            try:
                request = json.loads(line)
            except json.JSONDecodeError as err:
                # Same exception type as before, but naming the file and row.
                raise json.JSONDecodeError(
                    f"{err.msg} (in {path}, JSONL line {line_number})",
                    err.doc,
                    err.pos,
                ) from err

            where = f"{path}, JSONL line {line_number}"
            if not isinstance(request, dict):
                raise ValueError(f"{where}: expected a JSON object")

            missing = [key for key in ("custom_id", "body") if key not in request]
            if missing:
                raise ValueError(f"{where}: missing required field(s): {', '.join(missing)}")

            custom_id = request["custom_id"]
            if not isinstance(custom_id, str):
                raise ValueError(f"{where}: custom_id must be a string")
            if custom_id in seen_ids:
                raise ValueError(f"{where}: duplicate custom_id {custom_id!r}")
            seen_ids.add(custom_id)

            requests.append(request)

    return requests
def _download_file(file_id: str, destination: str) -> None:
    """Stream the Together file *file_id* to *destination* on disk."""
    with client.files.with_streaming_response.content(id=file_id) as response:
        with open(destination, "wb") as handle:
            for chunk in response.iter_bytes():
                handle.write(chunk)


def _preview_content(result: dict) -> str:
    """Return the assistant text of one output row, or "" if it has none."""
    # `or` fallbacks (rather than .get defaults) also cover explicit nulls and
    # an empty `choices` list, both of which would otherwise raise.
    body = (result.get("response") or {}).get("body") or {}
    choices = body.get("choices") or [{}]
    message = choices[0].get("message") or {}
    return message.get("content") or ""


def main() -> None:
    """Run the full batch workflow: build input, upload, create, poll, download.

    Requests come from ``--input-jsonl`` when given, otherwise from the
    ``--prompt`` values (or two built-in sample prompts). The output file is
    saved to ``--output-path`` and the error file to ``--error-path`` whenever
    the finished batch exposes them, including when the batch did not complete.

    Raises:
        SystemExit: With status 1 if the batch ends FAILED, EXPIRED or
            CANCELLED, or if the create call returns no job.
    """
    args = parse_args()
    prompts = args.prompt or [
        "What is the capital of France?",
        "Explain quantum computing in one sentence.",
    ]
    requests = (
        load_requests_from_jsonl(args.input_jsonl)
        if args.input_jsonl
        else build_sample_requests(prompts=prompts, model=args.model, max_tokens=args.max_tokens)
    )

    # --- 1. Prepare batch input file ---
    with tempfile.NamedTemporaryFile("w", suffix=".jsonl", delete=False, encoding="utf-8") as temp_file:
        temp_file.writelines(json.dumps(req) + "\n" for req in requests)
        input_path = Path(temp_file.name)

    print(f"Wrote {len(requests)} requests to {input_path}")

    # --- 2. Upload input file ---
    try:
        file_response = client.files.upload(file=str(input_path), purpose="batch-api", check=False)
    finally:
        # The temp file is only needed for the upload itself.
        input_path.unlink(missing_ok=True)
    file_id = file_response.id
    print(f"Uploaded file: {file_id}")

    # --- 3. Create batch job ---
    response = client.batches.create(
        input_file_id=file_id,
        endpoint="/v1/chat/completions",
    )
    # create() returns a wrapper; the batch itself lives under `.job`.
    batch = response.job
    if batch is None:
        print("Batch create response did not include a job.")
        raise SystemExit(1)
    print(f"Created batch: {batch.id} (status: {batch.status})")

    # --- 4. Poll for completion ---
    terminal_statuses = ("COMPLETED", "FAILED", "EXPIRED", "CANCELLED")
    while True:
        batch = client.batches.retrieve(batch.id)
        # progress may be unset early on; formatting None with :.0f would raise.
        progress = batch.progress or 0
        print(f"  Status: {batch.status} | Progress: {progress:.0f}%")

        if batch.status in terminal_statuses:
            break

        time.sleep(args.poll_interval)

    succeeded = batch.status == "COMPLETED"
    if not succeeded:
        print(f"Batch ended with status: {batch.status}")
        if batch.error:
            print(f"Error: {batch.error}")

    # --- 5. Download results ---
    if batch.output_file_id:
        _download_file(batch.output_file_id, args.output_path)
        print(f"\nResults saved to {args.output_path}")

        with open(args.output_path, encoding="utf-8") as handle:
            for line in handle:
                if not line.strip():
                    continue
                result = json.loads(line)
                custom_id = result.get("custom_id", "?")
                print(f"  [{custom_id}] {_preview_content(result)[:100]}")

    # --- 6. Check for errors ---
    # Done for failed batches too: that is when the error file matters most.
    if batch.error_file_id:
        _download_file(batch.error_file_id, args.error_path)
        print(f"Errors saved to {args.error_path}")

    if not succeeded:
        raise SystemExit(1)
/**
 * Run the full batch workflow: write a sample JSONL input, upload it, create
 * the batch, poll until it reaches a terminal status, then download the
 * output and error files.
 *
 * The output file is saved to `batch_results.jsonl` and the error file to
 * `batch_errors.jsonl` whenever the finished batch exposes them, including
 * when the batch did not complete. Exits with status 1 if the batch ends
 * FAILED, EXPIRED or CANCELLED.
 */
async function main(): Promise<void> {
  // --- 1. Prepare batch input file ---
  const requests = [
    {
      custom_id: "req-1",
      body: {
        model: "Qwen/Qwen2.5-7B-Instruct-Turbo",
        messages: [{ role: "user", content: "What is the capital of France?" }],
        max_tokens: 128,
      },
    },
    {
      custom_id: "req-2",
      body: {
        model: "Qwen/Qwen2.5-7B-Instruct-Turbo",
        messages: [
          {
            role: "user",
            content: "Explain quantum computing in one sentence.",
          },
        ],
        max_tokens: 128,
      },
    },
  ];

  const inputPath = path.join(os.tmpdir(), `batch_input_${Date.now()}.jsonl`);
  const lines = requests.map((r) => JSON.stringify(r)).join("\n") + "\n";
  fs.writeFileSync(inputPath, lines);
  console.log(`Wrote ${requests.length} requests to ${inputPath}`);

  // --- 2. Upload input file ---
  let fileId: string;
  try {
    const fileResp = await client.files.upload(inputPath, "batch-api", false);
    fileId = fileResp.id;
  } finally {
    // The temp file is only needed for the upload; remove it even on failure.
    fs.rmSync(inputPath, { force: true });
  }
  console.log(`Uploaded file: ${fileId}`);

  // --- 3. Create batch job ---
  const response = await client.batches.create({
    input_file_id: fileId,
    endpoint: "/v1/chat/completions",
  });
  // create() returns a wrapper; the batch itself lives under `.job`.
  const batchId = response.job?.id;
  if (!batchId) {
    throw new Error("Batch create response did not include a job id.");
  }
  console.log(`Created batch: ${batchId}`);

  // --- 4. Poll for completion ---
  const terminalStatuses = ["COMPLETED", "FAILED", "EXPIRED", "CANCELLED"];
  let batch: any;
  while (true) {
    batch = await client.batches.retrieve(batchId);
    console.log(
      `  Status: ${batch.status} | Progress: ${(batch.progress ?? 0).toFixed(0)}%`
    );

    if (terminalStatuses.includes(batch.status)) break;

    await new Promise((resolve) => setTimeout(resolve, 10_000));
  }

  const succeeded = batch.status === "COMPLETED";
  if (!succeeded) {
    console.error(`Batch ended with status: ${batch.status}`);
    if (batch.error) console.error(`Error: ${batch.error}`);
  }

  // Download a Together file to disk and return its text.
  const download = async (id: string, destination: string): Promise<string> => {
    const resp = await client.files.content(id);
    const text = await resp.text();
    fs.writeFileSync(destination, text);
    return text;
  };

  // --- 5. Download results ---
  if (batch.output_file_id) {
    const outputPath = "batch_results.jsonl";
    const text = await download(batch.output_file_id, outputPath);
    console.log(`\nResults saved to ${outputPath}`);

    for (const line of text.split("\n")) {
      // Skip blank lines; JSON.parse("") throws on an empty output file.
      if (!line.trim()) continue;
      const result = JSON.parse(line);
      const customId = result.custom_id ?? "?";
      const content =
        result.response?.body?.choices?.[0]?.message?.content ?? "";
      console.log(`  [${customId}] ${content.slice(0, 100)}`);
    }
  }

  // --- 6. Check for errors ---
  // Done for failed batches too: that is when the error file matters most.
  if (batch.error_file_id) {
    const errorPath = "batch_errors.jsonl";
    await download(batch.error_file_id, errorPath);
    console.log(`Errors saved to ${errorPath}`);
  }

  if (!succeeded) {
    process.exit(1);
  }
}
Reach for it whenever the user wants to build or debug text generation on Together AI, unless they specifically need batch jobs, embeddings, fine-tuning, dedicated endpoints, dedicated containers, or GPU clusters." +--- + +# Together Chat Completions + +## Overview + +Use Together AI's serverless chat/completions API for interactive inference workloads: + +- basic text generation +- streaming responses +- multi-turn chat state +- tool and function calling +- structured outputs +- reasoning-capable models + +Treat this skill as the default entry point for Together AI text generation unless the task is +clearly offline batch processing, vector retrieval, model training, or infrastructure management. + +## When This Skill Wins + +- Build a chatbot, assistant, or text-generation endpoint on Together AI +- Add streaming output to a real-time user experience +- Implement tool calling or function-calling loops +- Constrain model output to JSON or a regex-defined shape +- Choose between standard chat models and reasoning models +- Debug request parameters, model behavior, or response shapes + +## Hand Off To Another Skill + +- Use `together-batch-inference` for large offline runs, backfills, or lower-cost asynchronous jobs +- Use `together-embeddings` for vector search, semantic retrieval, or reranking +- Use `together-fine-tuning` when the user wants to train or adapt a model +- Use `together-dedicated-endpoints` when the user needs always-on single-tenant hosting +- Use `together-dedicated-containers` or `together-gpu-clusters` for custom infrastructure + +## Quick Routing + +- Basic chat, streaming, or multi-turn state + - Start with [references/api-parameters.md](references/api-parameters.md) + - Use [scripts/chat_basic.py](scripts/chat_basic.py) or [scripts/chat_basic.ts](scripts/chat_basic.ts) +- OpenAI SDK migration, rate limits, or debug headers + - Read [references/api-parameters.md](references/api-parameters.md) + - Use 
[scripts/debug_headers.py](scripts/debug_headers.py) or [scripts/debug_headers.ts](scripts/debug_headers.ts) +- Parallel async requests + - Use [scripts/async_parallel.py](scripts/async_parallel.py) +- Tool calling or function calling + - Read [references/function-calling-patterns.md](references/function-calling-patterns.md) + - Start from [scripts/tool_call_loop.py](scripts/tool_call_loop.py) or [scripts/tool_call_loop.ts](scripts/tool_call_loop.ts) +- Designing tools, schemas, or tool_choice for reliability + - Read the "Best Practices" section in [references/function-calling-patterns.md](references/function-calling-patterns.md) +- Structured outputs + - Read [references/structured-outputs.md](references/structured-outputs.md) + - Start from [scripts/structured_outputs.py](scripts/structured_outputs.py) or [scripts/structured_outputs.ts](scripts/structured_outputs.ts) +- Reasoning models or thinking-mode toggles + - Read [references/reasoning-models.md](references/reasoning-models.md) + - Start from [scripts/reasoning_models.py](scripts/reasoning_models.py) or [scripts/reasoning_models.ts](scripts/reasoning_models.ts) +- Combining tools + structured output, or tools + streaming + - Read the "Combining Tool Calls with Structured Output" section in + [references/function-calling-patterns.md](references/function-calling-patterns.md) + - Read the "Streaming Structured Output" section in + [references/structured-outputs.md](references/structured-outputs.md) +- Model selection, context length, or pricing-aware choices + - Read [references/models.md](references/models.md) + +## Workflow + +1. Confirm that the workload is interactive serverless inference rather than batch, retrieval, or training. +2. Pick the smallest model that satisfies latency, quality, and context requirements. +3. Decide whether the job needs plain text, tools, structured output, or reasoning. +4. Start from the matching script instead of re-deriving request shapes from scratch. +5. 
Pull deeper details from the relevant reference file only when needed. + +## High-Signal Rules + +- Python scripts require the Together v2 SDK (`together>=2.0.0`). If the user is on an older version, they must upgrade first: `uv pip install --upgrade "together>=2.0.0"`. +- Use `client.chat.completions.create()` in both Python and TypeScript; the method path is the same on either SDK client. +- Preserve full `messages` history for multi-turn conversations; do not rebuild context from final text only. +- For tools, implement the full loop: model tool call -> execute tool -> append tool result -> second model call. +- For tool definitions, prefer `enum` over free-form strings, set `"additionalProperties": false`, and add `"strict": true` on the function definition when you need argument generation to conform to the schema. +- Tool names must not contain spaces, periods, or dashes. Branch on `finish_reason` (`"tool_calls"` vs `"stop"`) instead of assuming a tool was called, and parse `function.arguments` as JSON inside a try/except (Python) or try/catch (TypeScript). +- Prefer `json_schema` over looser JSON modes when the user needs stable machine-readable output. +- Use reasoning models only when the task benefits from deeper deliberation; otherwise prefer cheaper standard models. +- To combine tool calling with structured output, use a two-phase approach: Phase 1 sends `tools` (no `response_format`), Phase 2 sends `response_format` (no `tools`) after tool results are appended. +- Streaming works with `response_format`; accumulate chunks and parse the final concatenated string as JSON. +- If the user needs many independent requests, combine this skill with `async_parallel.py` or hand off to batch inference.
+ +## Resource Map + +- Parameters and response fields: [references/api-parameters.md](references/api-parameters.md) +- OpenAI compatibility, rate-limit headers, and debug headers: [references/api-parameters.md](references/api-parameters.md) +- Function-calling patterns: [references/function-calling-patterns.md](references/function-calling-patterns.md) +- Structured outputs: [references/structured-outputs.md](references/structured-outputs.md) +- Reasoning models: [references/reasoning-models.md](references/reasoning-models.md) +- Model catalog: [references/models.md](references/models.md) + +## Scripts + +- [scripts/chat_basic.py](scripts/chat_basic.py) and [scripts/chat_basic.ts](scripts/chat_basic.ts): basic chat, streaming, and multi-turn state +- [scripts/debug_headers.py](scripts/debug_headers.py) and [scripts/debug_headers.ts](scripts/debug_headers.ts): raw-response inspection for routing, latency, and rate-limit headers +- [scripts/async_parallel.py](scripts/async_parallel.py): async Python fan-out for independent requests +- [scripts/tool_call_loop.py](scripts/tool_call_loop.py) and [scripts/tool_call_loop.ts](scripts/tool_call_loop.ts): full tool-call loop +- [scripts/structured_outputs.py](scripts/structured_outputs.py) and [scripts/structured_outputs.ts](scripts/structured_outputs.ts): schema-guided and regex outputs +- [scripts/reasoning_models.py](scripts/reasoning_models.py) and [scripts/reasoning_models.ts](scripts/reasoning_models.ts): reasoning fields, effort, and hybrid toggles + +## Official Docs + +- [Chat Overview](https://docs.together.ai/docs/chat-overview) +- [Inference Parameters](https://docs.together.ai/docs/inference-parameters) +- [Serverless Models](https://docs.together.ai/docs/serverless-models) +- [Function Calling](https://docs.together.ai/docs/function-calling) +- [JSON Mode](https://docs.together.ai/docs/json-mode) +- [Reasoning Overview](https://docs.together.ai/docs/reasoning-overview) +- [Chat Completions 
API](https://docs.together.ai/reference/chat-completions) diff --git a/plugins/togetherai/skills/together-chat-completions/references/api-parameters.md b/plugins/togetherai/skills/together-chat-completions/references/api-parameters.md new file mode 100644 index 00000000..ff2e06c8 --- /dev/null +++ b/plugins/togetherai/skills/together-chat-completions/references/api-parameters.md @@ -0,0 +1,474 @@ +# Chat Completions API Parameters +## Contents + +- [Required Parameters](#required-parameters) +- [Generation Parameters](#generation-parameters) +- [Output Control](#output-control) +- [Response Format](#response-format) +- [Function Calling](#function-calling) +- [Safety & Compliance](#safety--compliance) +- [Reasoning](#reasoning) +- [Context Handling](#context-handling) +- [Message Object](#message-object) +- [OpenAI Compatibility](#openai-compatibility) +- [Rate Limits & Build Tiers](#rate-limits--build-tiers) +- [Debug Mode](#debug-mode) +- [HTTP Status Codes](#http-status-codes) + + +## Required Parameters + +| Parameter | Type | Description | +|-----------|------|-------------| +| `model` | string | Model identifier (e.g., `meta-llama/Llama-3.3-70B-Instruct-Turbo`) | +| `messages` | array | Array of message objects with `role` and `content` | + +## Generation Parameters + +| Parameter | Type | Default | Range | Description | +|-----------|------|---------|-------|-------------| +| `max_tokens` | integer | varies | 1+ | Maximum tokens to generate | +| `temperature` | float | varies | 0-2 | Randomness.
Lower = more deterministic | +| `top_p` | float | 1.0 | 0-1 | Nucleus sampling threshold | +| `top_k` | integer | - | 1+ | Limit choices per token step | +| `min_p` | float | - | 0-1 | Alternative to top_p/top_k | +| `repetition_penalty` | float | 1.0 | - | Higher = less repetition | +| `presence_penalty` | float | 0 | -2.0 to 2.0 | Penalize tokens already present | +| `frequency_penalty` | float | 0 | -2.0 to 2.0 | Penalize frequent tokens | +| `stop` | string[] | - | - | Sequences that stop generation | +| `n` | integer | 1 | 1-128 | Number of completions to generate | +| `seed` | integer | - | - | For reproducible outputs | + +### Python Example + +```python +from together import Together + +client = Together() + +response = client.chat.completions.create( + model="meta-llama/Llama-3.3-70B-Instruct-Turbo", + messages=[ + {"role": "system", "content": "You are a helpful assistant"}, + {"role": "user", "content": "Hello!"}, + ], + max_tokens=512, + temperature=0.7, + top_p=0.9, + stop=["END"], +) +print(response.choices[0].message.content) +``` + +### TypeScript Example + +```typescript +import Together from "together-ai"; +const together = new Together(); + +const response = await together.chat.completions.create({ + model: "meta-llama/Llama-3.3-70B-Instruct-Turbo", + messages: [ + { role: "system", content: "You are a helpful assistant" }, + { role: "user", content: "Hello!" 
}, + ], + max_tokens: 512, + temperature: 0.7, + top_p: 0.9, + stop: ["END"], +}); +console.log(response.choices[0].message.content); +``` + +## Output Control + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `stream` | bool | false | Stream tokens as Server-Sent Events | +| `logprobs` | integer | - | Return top-k token log probs (0-20) | +| `echo` | bool | false | Include prompt in response | +| `logit_bias` | object | - | Token ID to bias value mapping | + +### Streaming Example (Python) + +```python +stream = client.chat.completions.create( + model="meta-llama/Llama-3.3-70B-Instruct-Turbo", + messages=[{"role": "user", "content": "Write a story"}], + stream=True, +) + +for chunk in stream: + if chunk.choices[0].delta.content: + print(chunk.choices[0].delta.content, end="", flush=True) +``` + +### Streaming Example (TypeScript) + +```typescript +const stream = await together.chat.completions.create({ + model: "meta-llama/Llama-3.3-70B-Instruct-Turbo", + messages: [{ role: "user", content: "Write a story" }], + stream: true, +}); + +for await (const chunk of stream) { + process.stdout.write(chunk.choices[0]?.delta?.content || ""); +} +``` + +## Response Format + +| Parameter | Type | Description | +|-----------|------|-------------| +| `response_format` | object | Control output structure | + +Options: + +```python +# Plain text (default) +response_format={"type": "text"} + +# JSON object (model decides structure, guided by prompt) +response_format={"type": "json_object"} + +# JSON schema (constrained to your schema) -- nested format with name +response_format={ + "type": "json_schema", + "json_schema": { + "name": "my_schema", + "schema": {...}, + }, +} + +# Regex pattern matching +response_format={"type": "regex", "pattern": "(positive|neutral|negative)"} +``` + +```typescript +// JSON schema (TypeScript with Zod) +import { z } from "zod"; + +const schema = z.object({ name: z.string(), age: z.number() }); +const 
jsonSchema = z.toJSONSchema(schema); + +response_format: { + type: "json_schema", + json_schema: { + name: "person", + schema: jsonSchema, + }, +} +``` + +## Function Calling + +| Parameter | Type | Description | +|-----------|------|-------------| +| `tools` | array | Tool definitions the model can call | +| `tool_choice` | string/object | `"auto"`, `"required"`, `"none"`, or specific function | + +```python +tools = [{ + "type": "function", + "function": { + "name": "get_weather", + "description": "Get weather for a location", + "parameters": { + "type": "object", + "properties": { + "location": {"type": "string", "description": "City name"}, + }, + "required": ["location"], + }, + }, +}] + +response = client.chat.completions.create( + model="Qwen/Qwen2.5-7B-Instruct-Turbo", + messages=[{"role": "user", "content": "Weather in NYC?"}], + tools=tools, + tool_choice="auto", +) +``` + +```typescript +const response = await together.chat.completions.create({ + model: "Qwen/Qwen2.5-7B-Instruct-Turbo", + messages: [{ role: "user", content: "Weather in NYC?" 
}], + tools: [ + { + type: "function", + function: { + name: "getWeather", + description: "Get weather for a location", + parameters: { + type: "object", + properties: { + location: { type: "string", description: "City name" }, + }, + required: ["location"], + }, + }, + }, + ], + tool_choice: "auto", +}); +``` + +## Safety & Compliance + +| Parameter | Type | Description | +|-----------|------|-------------| +| `safety_model` | string | Moderation model (e.g., `meta-llama/Llama-Guard-4-12B`) | +| `compliance` | string | Set to `"hipaa"` for HIPAA mode | + +## Reasoning + +| Parameter | Type | Values | Description | +|-----------|------|--------|-------------| +| `reasoning_effort` | string | `"low"`, `"medium"`, `"high"` | Control reasoning depth (GPT-OSS only) | +| `reasoning` | object | `{"enabled": true/false}` | Toggle reasoning for hybrid models | +| `chat_template_kwargs` | object | `{"thinking": true}` | Alternative reasoning toggle | + +### Reasoning Effort (Python) + +```python +stream = client.chat.completions.create( + model="openai/gpt-oss-120b", + messages=[{"role": "user", "content": "Prove the infinitude of primes"}], + temperature=1.0, + top_p=1.0, + reasoning_effort="medium", + stream=True, +) + +for chunk in stream: + print(chunk.choices[0].delta.content or "", end="", flush=True) +``` + +### Reasoning Effort (TypeScript) + +```typescript +const stream = await together.chat.completions.create({ + model: "openai/gpt-oss-120b", + messages: [{ role: "user", content: "Prove the infinitude of primes" }], + temperature: 1.0, + top_p: 1.0, + reasoning_effort: "medium", + stream: true, +}); + +for await (const chunk of stream) { + process.stdout.write(chunk.choices[0]?.delta?.content || ""); +} +``` + +### Enabling/Disabling Reasoning (Python) + +```python +stream = client.chat.completions.create( + model="moonshotai/Kimi-K2.6", + messages=[{"role": "user", "content": "Which is bigger, 9.11 or 9.9?"}], + reasoning={"enabled": True}, + temperature=1.0, + 
stream=True, +) + +for chunk in stream: + delta = chunk.choices[0].delta + if hasattr(delta, "reasoning") and delta.reasoning: + print(delta.reasoning, end="", flush=True) + if hasattr(delta, "content") and delta.content: + print(delta.content, end="", flush=True) +``` + +### Enabling/Disabling Reasoning (TypeScript) + +```typescript +import type { ChatCompletionChunk } from "together-ai/resources/chat/completions"; + +type ReasoningDelta = ChatCompletionChunk.Choice.Delta & { + reasoning?: string; +}; + +const stream = await together.chat.completions.create({ + model: "moonshotai/Kimi-K2.6", + messages: [{ role: "user", content: "Which is bigger, 9.11 or 9.9?" }], + reasoning: { enabled: true }, + temperature: 1.0, + stream: true, +} as any); + +for await (const chunk of stream) { + const delta = chunk.choices[0]?.delta as ReasoningDelta; + if (delta?.reasoning) process.stdout.write(delta.reasoning); + if (delta?.content) process.stdout.write(delta.content); +} +``` + +## Context Handling + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `context_length_exceeded_behavior` | string | `"error"` | `"truncate"` or `"error"` when exceeding context | + +## Message Object + +```python +{"role": "system", "content": "You are a helpful assistant."} +{"role": "user", "content": "Hello!"} +{"role": "assistant", "content": "Hi there!"} +{"role": "tool", "tool_call_id": "...", "content": "..."} +``` + +Multimodal content (vision models): + +```python +{"role": "user", "content": [ + {"type": "text", "text": "What's in this image?"}, + {"type": "image_url", "image_url": {"url": "https://..."}}, + {"type": "video_url", "video_url": {"url": "https://..."}}, + {"type": "audio_url", "audio_url": {"url": "https://..."}}, +]} +``` + +## OpenAI Compatibility + +Migrate from the OpenAI SDK by changing the base URL and API key while keeping the chat/completions +shape the same: + +```python +from openai import OpenAI + +client = OpenAI( + 
base_url="https://api.together.xyz/v1", + api_key="YOUR_TOGETHER_API_KEY", +) +response = client.chat.completions.create( + model="openai/gpt-oss-20b", + messages=[{"role": "user", "content": "Hello!"}], +) +print(response.choices[0].message.content) +``` + +Use the native Together SDK when you need Together-only helpers such as `.with_raw_response`, +typed reasoning fields, or other platform-specific workflows already covered by this skill. + +## Rate Limits & Build Tiers + +Serverless rate limits are model-specific and can change based on capacity. Treat response headers +as the source of truth for the current limit on the model you are calling. + +Current account-level LLM baselines from the official Together AI billing docs: + +| Build Tier | Lifetime Spend | LLM Rate Limit | +|------------|----------------|----------------| +| Build Tier 1 | $5 | 600 RPM | +| Build Tier 2 | $50 | 1,800 RPM | +| Build Tier 3 | $100 | 3,000 RPM | +| Build Tier 4 | $250 | 4,500 RPM | +| Build Tier 5 | $1,000 | 6,000 RPM | + +Rate-limit headers returned on serverless responses: + +| Header | Description | +|--------|-------------| +| `x-ratelimit-limit` | Maximum request rate currently allowed | +| `x-ratelimit-remaining` | Remaining request capacity in the current window | +| `x-ratelimit-reset` | Time until the request window resets | +| `x-tokenlimit-limit` | Maximum token rate currently allowed | +| `x-tokenlimit-remaining` | Remaining token capacity in the current window | +| `x-ratelimit-limit-dynamic` | Dynamic request-rate allowance when enabled | +| `x-ratelimit-remaining-dynamic` | Remaining dynamic request capacity | +| `x-tokenlimit-limit-dynamic` | Dynamic token-rate allowance when enabled | +| `x-tokenlimit-remaining-dynamic` | Remaining dynamic token capacity | + +Best practices: + +- plan against the latest headers instead of a hard-coded RPM table +- keep traffic steady instead of bursty +- use batch inference for high-volume offline jobs +- use dedicated 
endpoints for strict capacity or SLA requirements + +## Debug Mode + +Send the `x-together-debug: 1` header to get detailed routing and latency headers. Use +`.with_raw_response` in Python or `.asResponse()` in TypeScript when you need both the parsed body +and the raw headers. + +```python +from together import Together +import json + +client = Together() + +response = client.chat.completions.with_raw_response.create( + model="openai/gpt-oss-20b", + messages=[{"role": "user", "content": "Say hello"}], + extra_headers={"x-together-debug": "1"}, +) + +parsed = response.parse() +print(parsed.choices[0].message.content) + +headers = dict(response.headers) +print(json.dumps(headers, indent=2)) +``` + +```typescript +import Together from "together-ai"; + +const client = new Together(); + +const response = await client.chat.completions.create( + { + model: "openai/gpt-oss-20b", + messages: [{ role: "user", content: "Say hello" }], + }, + { headers: { "x-together-debug": "1" } } +).asResponse(); + +const parsed = await response.json(); +console.log(parsed.choices[0].message.content); + +for (const [key, value] of response.headers.entries()) { + if (key.startsWith("x-")) console.log(`${key}: ${value}`); +} +``` + +```shell +curl -s -D - -X POST "https://api.together.xyz/v1/chat/completions" \ + -H "Authorization: Bearer $TOGETHER_API_KEY" \ + -H "Content-Type: application/json" \ + -H "x-together-debug: 1" \ + -d '{"model":"openai/gpt-oss-20b","messages":[{"role":"user","content":"Say hello"}]}' +``` + +Common debug headers: + +| Header | Description | +|--------|-------------| +| `x-request-id` | Unique request ID for support tickets | +| `x-together-traceid` | Distributed trace ID for internal routing | +| `x-cluster` | Inference cluster that served the request | +| `x-engine-pod` | Engine pod that processed the request | +| `x-api-received` | Timestamp when the API received the request | +| `x-api-call-start` | Timestamp when inference started | +| `x-api-call-end` | 
Timestamp when inference completed | +| `x-inference-version` | Inference engine version | + +## HTTP Status Codes + +| Code | Description | +|------|-------------| +| 200 | Success | +| 400 | Bad request (invalid params) | +| 401 | Unauthorized (invalid API key) | +| 402 | Payment required (spending limit reached) | +| 403 | Input token count + max_tokens exceeds model context length | +| 404 | Model not found | +| 429 | Rate limit exceeded | +| 500 | Server error | +| 503 | Service overloaded | +| 504 | Request timeout | diff --git a/plugins/togetherai/skills/together-chat-completions/references/function-calling-patterns.md b/plugins/togetherai/skills/together-chat-completions/references/function-calling-patterns.md new file mode 100644 index 00000000..78e094db --- /dev/null +++ b/plugins/togetherai/skills/together-chat-completions/references/function-calling-patterns.md @@ -0,0 +1,826 @@ +# Function Calling Patterns Reference +## Contents + +- [6 Calling Patterns](#6-calling-patterns) +- [Combining Tool Calls with Structured Output](#combining-tool-calls-with-structured-output) +- [Processing Tool Calls](#processing-tool-calls) +- [tool_choice Parameter](#tool_choice-parameter) +- [Best Practices](#best-practices) +- [Supported Models](#supported-models) + + +## 6 Calling Patterns + +### 1. Simple -- Single function, single call + +Model picks one function and calls it once. + +```python +import json +from together import Together + +client = Together() + +tools = [{ + "type": "function", + "function": { + "name": "get_current_weather", + "description": "Get the current weather in a given location", + "parameters": { + "type": "object", + "properties": { + "location": { + "type": "string", + "description": "The city and state, e.g.
San Francisco, CA", + }, + "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]}, + }, + }, + }, +}] + +response = client.chat.completions.create( + model="Qwen/Qwen2.5-7B-Instruct-Turbo", + messages=[ + {"role": "system", "content": "You are a helpful assistant that can access external functions."}, + {"role": "user", "content": "What is the current temperature of New York?"}, + ], + tools=tools, +) + +tool_call = response.choices[0].message.tool_calls[0] +print(f"Function: {tool_call.function.name}") +print(f"Arguments: {tool_call.function.arguments}") +``` + +```typescript +import Together from "together-ai"; +const together = new Together(); + +const response = await together.chat.completions.create({ + model: "Qwen/Qwen2.5-7B-Instruct-Turbo", + messages: [ + { + role: "system", + content: "You are a helpful assistant that can access external functions.", + }, + { role: "user", content: "What is the current temperature of New York?" }, + ], + tools: [ + { + type: "function", + function: { + name: "getCurrentWeather", + description: "Get the current weather in a given location", + parameters: { + type: "object", + properties: { + location: { + type: "string", + description: "The city and state, e.g. San Francisco, CA", + }, + unit: { + type: "string", + description: "The unit of temperature", + enum: ["celsius", "fahrenheit"], + }, + }, + }, + }, + }, + ], +}); + +console.log(JSON.stringify(response.choices[0].message?.tool_calls, null, 2)); +``` + +### 2. Multiple Functions -- Model picks which to call + +Multiple tools available, model chooses the right one based on user intent. 
+ +```python +tools = [ + { + "type": "function", + "function": { + "name": "get_current_weather", + "description": "Get the current weather in a given location", + "parameters": { + "type": "object", + "properties": { + "location": {"type": "string", "description": "City and state"}, + "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]}, + }, + }, + }, + }, + { + "type": "function", + "function": { + "name": "get_current_stock_price", + "description": "Get the current stock price for a given stock symbol", + "parameters": { + "type": "object", + "properties": { + "symbol": {"type": "string", "description": "The stock symbol, e.g. AAPL"}, + "exchange": { + "type": "string", + "description": "The stock exchange (optional)", + "enum": ["NYSE", "NASDAQ", "LSE", "TSX"], + }, + }, + "required": ["symbol"], + }, + }, + }, +] + +response = client.chat.completions.create( + model="Qwen/Qwen2.5-7B-Instruct-Turbo", + messages=[{"role": "user", "content": "What's the current price of Apple's stock?"}], + tools=tools, +) +# Model correctly picks get_current_stock_price(symbol="AAPL") +``` + +```typescript +const tools = [ + { + type: "function" as const, + function: { + name: "getCurrentWeather", + description: "Get the current weather in a given location", + parameters: { + type: "object", + properties: { + location: { + type: "string", + description: "The city and state, e.g. San Francisco, CA", + }, + unit: { + type: "string", + description: "The unit of temperature", + enum: ["celsius", "fahrenheit"], + }, + }, + }, + }, + }, + { + type: "function" as const, + function: { + name: "getCurrentStockPrice", + description: "Get the current stock price for a given stock symbol", + parameters: { + type: "object", + properties: { + symbol: { + type: "string", + description: "The stock symbol, e.g. 
AAPL, GOOGL, TSLA", + }, + exchange: { + type: "string", + description: "The stock exchange (optional)", + enum: ["NYSE", "NASDAQ", "LSE", "TSX"], + }, + }, + required: ["symbol"], + }, + }, + }, +]; + +const response = await together.chat.completions.create({ + model: "Qwen/Qwen2.5-7B-Instruct-Turbo", + messages: [ + { role: "user", content: "What's the current price of Apple's stock?" }, + ], + tools, +}); + +// Model correctly picks getCurrentStockPrice(symbol="AAPL") +console.log(JSON.stringify(response.choices[0].message?.tool_calls, null, 2)); +``` + +### 3. Parallel -- Same function, multiple calls + +Model calls the same function multiple times in one turn. + +```python +response = client.chat.completions.create( + model="Qwen/Qwen2.5-7B-Instruct-Turbo", + messages=[ + {"role": "system", "content": "You are a helpful assistant that can access external functions."}, + { + "role": "user", + "content": "What is the current temperature of New York, San Francisco and Chicago?", + }, + ], + tools=[{ + "type": "function", + "function": { + "name": "get_current_weather", + "description": "Get the current weather in a given location", + "parameters": { + "type": "object", + "properties": { + "location": { + "type": "string", + "description": "The city and state, e.g. 
San Francisco, CA", + }, + "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]}, + }, + }, + }, + }], +) + +# Model returns 3 tool_calls: +# get_current_weather(location="New York, NY", unit="fahrenheit") +# get_current_weather(location="San Francisco, CA", unit="fahrenheit") +# get_current_weather(location="Chicago, IL", unit="fahrenheit") +for tc in response.choices[0].message.tool_calls: + print(f" {tc.function.name}({tc.function.arguments})") +``` + +```typescript +const response = await together.chat.completions.create({ + model: "Qwen/Qwen2.5-7B-Instruct-Turbo", + messages: [ + { + role: "system", + content: "You are a helpful assistant that can access external functions.", + }, + { + role: "user", + content: + "What is the current temperature of New York, San Francisco and Chicago?", + }, + ], + tools: [ + { + type: "function", + function: { + name: "getCurrentWeather", + description: "Get the current weather in a given location", + parameters: { + type: "object", + properties: { + location: { + type: "string", + description: "The city and state, e.g. San Francisco, CA", + }, + unit: { + type: "string", + description: "The unit of temperature", + enum: ["celsius", "fahrenheit"], + }, + }, + }, + }, + }, + ], +}); + +// Model returns 3 tool_calls for NYC, SF, and Chicago +console.log(JSON.stringify(response.choices[0].message?.tool_calls, null, 2)); +``` + +### 4. Parallel Multiple -- Different functions in one turn + +Model calls multiple different functions simultaneously. Combines parallel and multiple function +calling: one user prompt triggers multiple different function calls. + +```python +# User: "What's Apple and Google's stock price, and what's the weather in NYC, SF, and Chicago?" 
+# Model returns 5 tool_calls: +# get_current_stock_price(symbol="AAPL") +# get_current_stock_price(symbol="GOOGL") +# get_current_weather(location="New York, NY") +# get_current_weather(location="San Francisco, CA") +# get_current_weather(location="Chicago, IL") + +response = client.chat.completions.create( + model="Qwen/Qwen2.5-7B-Instruct-Turbo", + messages=[ + { + "role": "user", + "content": ( + "What's Apple and Google's stock price, and what's the weather " + "in New York, San Francisco, and Chicago?" + ), + }, + ], + tools=tools, # both weather and stock tools defined +) + +for tc in response.choices[0].message.tool_calls: + print(f" {tc.function.name}({tc.function.arguments})") +``` + +```typescript +const response = await together.chat.completions.create({ + model: "Qwen/Qwen2.5-7B-Instruct-Turbo", + messages: [ + { + role: "user", + content: + "What's Apple and Google's stock price, and what's the weather in NYC, SF, and Chicago?", + }, + ], + tools, // both weather and stock tools defined +}); + +// Returns 5 tool_calls: 2 stock + 3 weather +for (const tc of response.choices[0].message?.tool_calls ?? []) { + console.log(` ${tc.function.name}(${tc.function.arguments})`); +} +``` + +### 5. Multi-step -- Chained function calls + +Sequential function calls within one conversation turn. Functions are called, results are processed, +then used to inform the final response. 
+ +```python +import json +from together import Together + +client = Together() + +messages = [ + {"role": "system", "content": "You are a helpful assistant that can access external functions."}, + { + "role": "user", + "content": "What is the current temperature of New York, San Francisco and Chicago?", + }, +] + +# Step 1: Model generates tool calls +response = client.chat.completions.create( + model="Qwen/Qwen2.5-7B-Instruct-Turbo", + messages=messages, + tools=tools, +) + +# Step 2: Execute functions and add results +messages.append(response.choices[0].message) +for tc in response.choices[0].message.tool_calls: + args = json.loads(tc.function.arguments) + result = get_current_weather(args) # your function + messages.append({ + "role": "tool", + "tool_call_id": tc.id, + "content": json.dumps(result), + }) + +# Step 3: Model produces final answer using all results +final = client.chat.completions.create( + model="Qwen/Qwen2.5-7B-Instruct-Turbo", + messages=messages, + tools=tools, +) +print(final.choices[0].message.content) +# "The current temperature in New York is 11F, in San Francisco it is 55F, ..." +``` + +```typescript +import Together from "together-ai"; +const together = new Together(); + +const messages: any[] = [ + { + role: "system", + content: "You are a helpful assistant that can access external functions.", + }, + { + role: "user", + content: + "What is the current temperature of New York, San Francisco and Chicago?", + }, +]; + +// Step 1: Model generates tool calls +const response = await together.chat.completions.create({ + model: "Qwen/Qwen2.5-7B-Instruct-Turbo", + messages, + tools, +}); + +// Step 2: Execute functions and add results +messages.push(response.choices[0].message); +for (const tc of response.choices[0].message?.tool_calls ?? 
[]) { + const args = JSON.parse(tc.function.arguments); + const result = getCurrentWeather(args); // your function + messages.push({ + role: "tool", + tool_call_id: tc.id, + content: JSON.stringify(result), + }); +} + +// Step 3: Model produces final answer +const final = await together.chat.completions.create({ + model: "Qwen/Qwen2.5-7B-Instruct-Turbo", + messages, + tools, +}); +console.log(final.choices[0].message.content); +``` + +### 6. Multi-turn -- Function calls across conversation turns + +Context is maintained across multiple conversation turns and functions can be called at any point. +Previous function results inform future decisions, enabling truly agentic behavior. + +```python +messages = [ + {"role": "system", "content": "You are a travel planning assistant."}, +] + +# Turn 1: User asks about weather in 3 cities +messages.append({ + "role": "user", + "content": "What's the weather in NYC, SF, and Chicago?", +}) + +response = client.chat.completions.create( + model="Qwen/Qwen2.5-7B-Instruct-Turbo", + messages=messages, + tools=tools, +) + +# Execute weather calls, add results to messages... +messages.append(response.choices[0].message) +for tc in response.choices[0].message.tool_calls: + args = json.loads(tc.function.arguments) + result = get_current_weather(args) + messages.append({"role": "tool", "tool_call_id": tc.id, "content": json.dumps(result)}) + +final = client.chat.completions.create( + model="Qwen/Qwen2.5-7B-Instruct-Turbo", + messages=messages, + tools=tools, +) +messages.append(final.choices[0].message) + +# Turn 2: User follows up -- model uses previous context +messages.append({ + "role": "user", + "content": "Which city has the best weather for outdoor dining? 
Find me a restaurant there.", +}) + +response2 = client.chat.completions.create( + model="Qwen/Qwen2.5-7B-Instruct-Turbo", + messages=messages, + tools=tools, +) +# Model reuses the turn-1 weather results to pick a city, then calls get_restaurant for it (e.g. location="San Francisco") +``` + +```typescript +const messages: any[] = [ + { role: "system", content: "You are a travel planning assistant." }, +]; + +// Turn 1: User asks about weather +messages.push({ + role: "user", + content: "What's the weather in NYC, SF, and Chicago?", +}); + +const response = await together.chat.completions.create({ + model: "Qwen/Qwen2.5-7B-Instruct-Turbo", + messages, + tools, +}); + +// Execute weather calls, add results... +messages.push(response.choices[0].message); +for (const tc of response.choices[0].message?.tool_calls ?? []) { + const args = JSON.parse(tc.function.arguments); + const result = getCurrentWeather(args); + messages.push({ + role: "tool", + tool_call_id: tc.id, + content: JSON.stringify(result), + }); +} + +const final = await together.chat.completions.create({ + model: "Qwen/Qwen2.5-7B-Instruct-Turbo", + messages, + tools, +}); +messages.push(final.choices[0].message); + +// Turn 2: Model uses previous weather data to recommend +messages.push({ + role: "user", + content: + "Which city has the best weather for outdoor dining? Find me a restaurant there.", +}); + +const response2 = await together.chat.completions.create({ + model: "Qwen/Qwen2.5-7B-Instruct-Turbo", + messages, + tools, +}); +// Model picks the best-weather city and calls get_restaurant +``` + +## Combining Tool Calls with Structured Output + +You cannot pass `tools` and `response_format` in the same request. Use a two-phase approach: + +1. Phase 1 (tool detection): Send with `tools`, no `response_format`. Model decides whether to call + functions. +2. Phase 2 (structured response): After executing tools and appending results, send a follow-up + request with `response_format` (and optionally `stream=True`) but without `tools`. 
+ +```python +import json +from together import Together +from pydantic import BaseModel, Field + +client = Together() + +class ChatResponse(BaseModel): + response: str = Field(description="The assistant's answer") + confidence: float = Field(description="Confidence from 0.0 to 1.0") + sources: list[str] = Field(description="Data sources used") + +messages = [ + {"role": "system", "content": "You are a helpful assistant with weather tools."}, + {"role": "user", "content": "What's the weather in NYC?"}, +] + +# Phase 1: tool detection (no response_format) +response = client.chat.completions.create( + model="meta-llama/Llama-3.3-70B-Instruct-Turbo", + messages=messages, + tools=tools, +) + +# Execute tool calls and append results +tool_calls = response.choices[0].message.tool_calls +if tool_calls: + messages.append(response.choices[0].message) + for tc in tool_calls: + result = execute_function(tc.function.name, json.loads(tc.function.arguments)) + messages.append({ + "role": "tool", + "tool_call_id": tc.id, + "content": json.dumps(result), + }) + +# Phase 2: structured response (no tools) +schema = ChatResponse.model_json_schema() +final = client.chat.completions.create( + model="meta-llama/Llama-3.3-70B-Instruct-Turbo", + messages=messages, + response_format={ + "type": "json_schema", + "json_schema": {"name": "chat_response", "schema": schema}, + }, + stream=True, # can stream structured JSON +) + +chunks = [] +for chunk in final: + token = chunk.choices[0].delta.content or "" + chunks.append(token) + print(token, end="", flush=True) +output = json.loads("".join(chunks)) +``` + +```typescript +import Together from "together-ai"; +import { z } from "zod"; + +const together = new Together(); + +const chatResponseSchema = z.object({ + response: z.string().describe("The assistant's answer"), + confidence: z.number().describe("Confidence from 0.0 to 1.0"), + sources: z.array(z.string()).describe("Data sources used"), +}); +const jsonSchema = 
z.toJSONSchema(chatResponseSchema); + +// Phase 1: tool detection +const response = await together.chat.completions.create({ + model: "meta-llama/Llama-3.3-70B-Instruct-Turbo", + messages, + tools, +}); + +// Execute tool calls, append results to messages... + +// Phase 2: structured response (no tools), with streaming +const stream = await together.chat.completions.create({ + model: "meta-llama/Llama-3.3-70B-Instruct-Turbo", + messages, + response_format: { + type: "json_schema", + json_schema: { name: "chat_response", schema: jsonSchema }, + }, + stream: true, +}); + +const chunks: string[] = []; +for await (const chunk of stream) { + const token = chunk.choices[0]?.delta?.content || ""; + chunks.push(token); + process.stdout.write(token); +} +const output = JSON.parse(chunks.join("")); +``` + +## Processing Tool Calls + +### Python + +```python +import json + +# 1. Get tool calls from response +tool_calls = response.choices[0].message.tool_calls + +# 2. Add assistant message to history +messages.append(response.choices[0].message) + +# 3. Execute each function and add results +for tc in tool_calls: + args = json.loads(tc.function.arguments) + result = execute_function(tc.function.name, args) + messages.append({ + "role": "tool", + "tool_call_id": tc.id, + "content": json.dumps(result), + }) + +# 4. Get final response +final = client.chat.completions.create( + model="Qwen/Qwen2.5-7B-Instruct-Turbo", + messages=messages, + tools=tools, +) +``` + +### TypeScript + +```typescript +// 1. Get tool calls from response +const toolCalls = response.choices[0].message?.tool_calls ?? []; + +// 2. Add assistant message to history +messages.push(response.choices[0].message); + +// 3. Execute each function and add results +for (const tc of toolCalls) { + const args = JSON.parse(tc.function.arguments); + const result = executeFunction(tc.function.name, args); + messages.push({ + role: "tool", + tool_call_id: tc.id, + content: JSON.stringify(result), + }); +} + +// 4. 
Get final response +const final = await together.chat.completions.create({ + model: "Qwen/Qwen2.5-7B-Instruct-Turbo", + messages, + tools, +}); +``` + +## tool_choice Parameter + +| Value | Behavior | +|-------|----------| +| `"auto"` (default) | Model decides whether to call functions | +| `"required"` | Model must call at least one function | +| `"none"` | Never call functions | +| `{"type": "function", "function": {"name": "fn"}}` | Force specific function | + +## Best Practices + +The quality of tool definitions, system prompt, and selection controls determines how reliably the +model calls functions. Apply these rules when building or debugging a tool-calling app. + +### Write tight function descriptions + +The description is the only context the model has for deciding when to call a tool and how to fill +its arguments. Treat it as a short spec: + +- State what the tool does and when to use it (and when not to). +- Describe each parameter's meaning, expected format, and effect on the result. +- Note caveats: what the tool does not return, edge cases. +- Describe what the output represents. + +Aim for three to four sentences per tool. If a new engineer could correctly call the function from +the schema alone, the model can too. Fold concrete examples into the description prose or the +system prompt - the OpenAI-compatible tool schema (`type`, `function.name`, +`function.description`, `function.parameters`) has no separate examples field. + +### Make invalid states unrepresentable + +Constrain what the model can produce via JSON Schema rather than validating after the fact: + +- Give every parameter a type. Use `enum` whenever the valid values are a fixed set. +- List the parameters the model must supply in `required`. Leave optional ones out. +- Replace contradictory flag pairs (e.g. `on: bool, off: bool`) with a single `enum` field + (`state: ["on", "off"]`). 
+- Set `"additionalProperties": false` on the parameters object so the model cannot add unknown + fields. +- For stricter conformance, add `"strict": true` to the function definition. Together's API accepts + it and constrains generated arguments to match your schema: + +```python +tools = [ + { + "type": "function", + "function": { + "name": "get_current_weather", + "description": "Get the current temperature for a location.", + "parameters": { + "type": "object", + "properties": { + "location": {"type": "string", "description": "City and state, e.g. San Francisco, CA"}, + "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]}, + }, + "required": ["location"], + "additionalProperties": False, + }, + "strict": True, + }, + } +] +``` + +### Keep the active tool set small + +More tools means more chances to pick the wrong one. Aim for fewer than 20 active tools in `tools` +per request, and evaluate as the set grows. + +- Consolidate related operations: prefer one `manage_ticket` with an `action` enum over separate + `create_ticket` / `update_ticket` / `close_ticket` tools. +- Namespace tool names across services: `github_list_prs`, `slack_send_message`. +- Scope tools per turn: pass only the subset relevant to the current conversation, not the whole + catalog. +- Tool names must not contain spaces, periods, or dashes (e.g. `get_current_weather`, not + `get current weather` or `get-current-weather`). + +### Offload work from the model to your code + +Don't ask the model to produce information the application already has: + +- Drop arguments you already know. If `order_id` is held in app state, expose `submit_refund()` + with no arguments and inject the id in your code when executing the call. +- Combine always-sequential calls into one tool. One round trip is more reliable than two. 
+ +### Guide the model with the system prompt + +- Give the model a role: `You are a travel planning assistant with access to weather and + restaurant tools.` +- State when to use each tool, and when not to. +- Forbid guessing: `Do not guess values. If a required detail is missing, ask the user for it + before calling a tool.` +- Encourage clarification when the request is ambiguous. + +### Handle responses and errors robustly + +Tool calls come back in `message.tool_calls`, not `message.content` (which is often `null` on a +tool-calling turn). Build the loop defensively: + +- Check `finish_reason`: `"tool_calls"` means run a tool; `"stop"` means a normal text reply. + Branch on it instead of assuming a tool was called. +- Parse `function.arguments` as JSON inside a try/except - handle malformed or incomplete JSON. +- On tool failure, return a clear error payload in the `tool` message content (for example + `{"error": "No stock found for symbol XYZ"}`) so the model can recover, rather than throwing. +- Validate high-consequence calls (orders, refunds, deletes) with the user before executing. +- Validate and sanitize arguments before acting on them; keep secrets out of tool arguments. + +### Tune for reliable calls + +- Lower the temperature (e.g. `0`) to make tool selection and argument generation more + deterministic. Raise it only when more varied behavior is needed. +- Stream when latency matters: tool calls stream incrementally through `delta.tool_calls`. +- Watch the token budget: tool descriptions and schemas count toward input tokens. Tighten or + split the tool set if you approach the limit. + +### When to fine-tune + +Strong descriptions and a focused tool set cover most cases. For higher accuracy across a large +tool catalog or a difficult domain-specific task, fine-tune a model on your own tool-calling data. +See the `together-fine-tuning` skill for the function-calling dataset format and training +workflow. 
+ +## Supported Models + +openai/gpt-oss-120b, openai/gpt-oss-20b, moonshotai/Kimi-K2.6, +zai-org/GLM-5.1, zai-org/GLM-5, MiniMaxAI/MiniMax-M2.7, Qwen/Qwen3.5-397B-A17B, +Qwen/Qwen3.5-9B, Qwen/Qwen3.6-Plus, +Qwen/Qwen3-235B-A22B-Instruct-2507-tput, deepseek-ai/DeepSeek-V4-Pro, +meta-llama/Llama-3.3-70B-Instruct-Turbo, Qwen/Qwen2.5-7B-Instruct-Turbo, google/gemma-4-31B-it diff --git a/plugins/togetherai/skills/together-chat-completions/references/models.md b/plugins/togetherai/skills/together-chat-completions/references/models.md new file mode 100644 index 00000000..5e782e21 --- /dev/null +++ b/plugins/togetherai/skills/together-chat-completions/references/models.md @@ -0,0 +1,58 @@ +# Chat Model Catalog + +## Recommended Models by Use Case + +| Use Case | Model | API String | Alternatives | +|----------|-------|-----------|-------------| +| Chat (best) | Kimi K2.6 | `moonshotai/Kimi-K2.6` | `MiniMaxAI/MiniMax-M2.7`, `openai/gpt-oss-120b` | +| Reasoning | DeepSeek-V4-Pro | `deepseek-ai/DeepSeek-V4-Pro` | `moonshotai/Kimi-K2.6`, `Qwen/Qwen3.6-Plus` | +| Coding Agents | GLM-5.1 | `zai-org/GLM-5.1` | `moonshotai/Kimi-K2.6`, `deepseek-ai/DeepSeek-V4-Pro`, `MiniMaxAI/MiniMax-M2.7` | +| Small & Fast | GPT-OSS 20B | `openai/gpt-oss-20b` | `Qwen/Qwen2.5-7B-Instruct-Turbo`, `google/gemma-3n-E4B-it` | +| Medium General | GPT-OSS 120B | `openai/gpt-oss-120b` | `zai-org/GLM-5` | +| Function Calling | GLM-5.1 | `zai-org/GLM-5.1` | `moonshotai/Kimi-K2.6`, `MiniMaxAI/MiniMax-M2.7` | +| Vision | Qwen3.5 397B | `Qwen/Qwen3.5-397B-A17B` | `Qwen/Qwen3.5-9B`, `google/gemma-4-31B-it` | + +## Full Chat Model Catalog + +| Organization | Model | API String | Context | Quant | +|-------------|-------|-----------|---------|-------| +| MiniMax | MiniMax M2.7 | `MiniMaxAI/MiniMax-M2.7` | 202,752 | FP4 | +| Qwen | Qwen3.7 Max | `Qwen/Qwen3.7-Max` | - | - | +| Qwen | Qwen3.5 397B A17B | `Qwen/Qwen3.5-397B-A17B` | 262,144 | BF16 | +| Qwen | Qwen3.6 Plus | `Qwen/Qwen3.6-Plus` | 1,000,000 | - | +| 
Qwen | Qwen3.5 9B | `Qwen/Qwen3.5-9B` | 262,144 | FP8 | +| Qwen | Qwen3 235B Instruct | `Qwen/Qwen3-235B-A22B-Instruct-2507-tput` | 262,144 | FP8 | +| Moonshot | Kimi K2.6 | `moonshotai/Kimi-K2.6` | 262,144 | FP4 | +| DeepSeek | DeepSeek-V4-Pro | `deepseek-ai/DeepSeek-V4-Pro` | 512,000 | FP4 | +| NVIDIA | Nemotron 3 Ultra 550B A55B | `nvidia/nemotron-3-ultra-550b-a55b` | 512,300 | NVFP4 | +| OpenAI | GPT-OSS 120B | `openai/gpt-oss-120b` | 128,000 | MXFP4 | +| OpenAI | GPT-OSS 20B | `openai/gpt-oss-20b` | 128,000 | MXFP4 | +| Z.ai | GLM-5.1 | `zai-org/GLM-5.1` | 202,752 | FP4 | +| Z.ai | GLM-5 | `zai-org/GLM-5` | 202,752 | FP4 | +| Meta | Llama 3.3 70B Turbo | `meta-llama/Llama-3.3-70B-Instruct-Turbo` | 131,072 | FP8 | +| Meta | Llama 3 8B Lite | `meta-llama/Meta-Llama-3-8B-Instruct-Lite` | 8,192 | - | +| Deep Cogito | Cogito v2.1 671B | `deepcogito/cogito-v2-1-671b` | 163,840 | - | +| Google | Gemma 4 31B IT | `google/gemma-4-31B-it` | 262,144 | FP8 | +| Google | Gemma 3N E4B | `google/gemma-3n-E4B-it` | 32,768 | FP8 | +| Liquid AI | LFM2-24B-A2B | `LiquidAI/LFM2-24B-A2B` | 32,768 | - | +| Qwen | Qwen 2.5 7B Turbo | `Qwen/Qwen2.5-7B-Instruct-Turbo` | 32,768 | FP8 | +| Essential AI | Rnj-1 Instruct | `essentialai/rnj-1-instruct` | 32,768 | BF16 | + +## Vision Models + +| Organization | Model | API String | Context | +|-------------|-------|-----------|---------| +| Qwen | Qwen3.5 397B A17B | `Qwen/Qwen3.5-397B-A17B` | 262,144 | +| Qwen | Qwen3.5 9B | `Qwen/Qwen3.5-9B` | 262,144 | +| Google | Gemma 4 31B IT | `google/gemma-4-31B-it` | 262,144 | + +## Moderation Models + +| Model | API String | Context | +|-------|-----------|---------| +| Llama Guard 4 (12B) | `meta-llama/Llama-Guard-4-12B` | 1,048,576 | + +## Quantization Types +- FP16/BF16: 16-bit floating point (full precision) +- FP8: 8-bit floating point (Turbo models) +- FP4/MXFP4/NVFP4: 4-bit floating point diff --git a/plugins/togetherai/skills/together-chat-completions/references/reasoning-models.md 
b/plugins/togetherai/skills/together-chat-completions/references/reasoning-models.md new file mode 100644 index 00000000..61e4a018 --- /dev/null +++ b/plugins/togetherai/skills/together-chat-completions/references/reasoning-models.md @@ -0,0 +1,423 @@ +# Reasoning Models Reference +## Contents + +- [Full Model Table](#full-model-table) +- [Reasoning Effort Levels](#reasoning-effort-levels) +- [Enabling and Disabling Reasoning (Hybrid Models)](#enabling-and-disabling-reasoning-hybrid-models) +- [Controlling Reasoning Depth via Prompting](#controlling-reasoning-depth-via-prompting) +- [Reasoning Output Format](#reasoning-output-format) +- [Structured Outputs with Reasoning](#structured-outputs-with-reasoning) +- [Best Practices by Model](#best-practices-by-model) + + +## Full Model Table + +| Model | API String | Type | Context | Tool Calling | +|-------|-----------|------|---------|--------------| +| DeepSeek-V4-Pro | `deepseek-ai/DeepSeek-V4-Pro` | Hybrid (on by default) | 512K | Yes | +| GLM-5.1 | `zai-org/GLM-5.1` | Hybrid (on by default) | 200K | Yes | +| GLM-5 | `zai-org/GLM-5` | Hybrid (on by default) | 200K | Yes | +| GPT-OSS 120B | `openai/gpt-oss-120b` | Adjustable effort | 128K | No | +| GPT-OSS 20B | `openai/gpt-oss-20b` | Adjustable effort | 128K | No | +| Kimi K2.6 | `moonshotai/Kimi-K2.6` | Hybrid (on by default) | 262K | Yes | +| MiniMax M2.7 | `MiniMaxAI/MiniMax-M2.7` | Reasoning only | 202K | Yes | +| Nemotron 3 Ultra 550B A55B | `nvidia/nemotron-3-ultra-550b-a55b` | Hybrid (on by default) | 512K | Yes | +| Qwen3.5 397B | `Qwen/Qwen3.5-397B-A17B` | Hybrid (on by default) | 262K | Yes | +| Qwen3.5 9B | `Qwen/Qwen3.5-9B` | Hybrid (on by default) | 262K | Yes | +| Qwen3.6 Plus | `Qwen/Qwen3.6-Plus` | Hybrid (on by default) | 1M | Yes | + +Type definitions: +- Reasoning only: Always produces reasoning tokens. Cannot be toggled off. +- Hybrid: Supports both reasoning and non-reasoning modes via `reasoning={"enabled": True/False}`. 
+- Adjustable effort: Supports `reasoning_effort` parameter (`"low"`, `"medium"`, `"high"`). + +## Reasoning Effort Levels + +GPT-OSS models support `reasoning_effort` to control reasoning depth: + +| Level | Behavior | Best For | +|-------|----------|----------| +| `"low"` | Minimal thinking, fast | Simple factual questions | +| `"medium"` | Balanced (recommended default) | Most tasks | +| `"high"` | Extensive thinking, thorough | Complex math, code, logic proofs | + +### Python + +```python +from together import Together + +client = Together() + +stream = client.chat.completions.create( + model="openai/gpt-oss-120b", + messages=[{"role": "user", "content": "Prove the infinitude of primes"}], + temperature=1.0, + top_p=1.0, + reasoning_effort="high", + stream=True, +) + +for chunk in stream: + print(chunk.choices[0].delta.content or "", end="", flush=True) +``` + +### TypeScript + +```typescript +import Together from "together-ai"; +const together = new Together(); + +const stream = await together.chat.completions.create({ + model: "openai/gpt-oss-120b", + messages: [{ role: "user", content: "Prove the infinitude of primes" }], + temperature: 1.0, + top_p: 1.0, + reasoning_effort: "high", + stream: true, +}); + +for await (const chunk of stream) { + process.stdout.write(chunk.choices[0]?.delta?.content || ""); +} +``` + +### cURL + +```shell +curl -X POST "https://api.together.xyz/v1/chat/completions" \ + -H "Authorization: Bearer $TOGETHER_API_KEY" \ + -H "Content-Type: application/json" \ + -d '{ + "model": "openai/gpt-oss-120b", + "messages": [ + {"role": "user", "content": "Prove the infinitude of primes"} + ], + "temperature": 1.0, + "reasoning_effort": "high" + }' +``` + +## Enabling and Disabling Reasoning (Hybrid Models) + +Hybrid models support `reasoning={"enabled": True/False}` to toggle reasoning on or off. 
+ +Models supporting this parameter: +- `deepseek-ai/DeepSeek-V4-Pro` (on by default) +- `Qwen/Qwen3.5-397B-A17B` (on by default) +- `Qwen/Qwen3.5-9B` (on by default) +- `Qwen/Qwen3.6-Plus` (on by default) +- `moonshotai/Kimi-K2.6` (on by default) +- `nvidia/nemotron-3-ultra-550b-a55b` (on by default) +- `zai-org/GLM-5.1` (on by default) +- `zai-org/GLM-5` (on by default) + +### Python -- Enable Reasoning + +```python +from together import Together + +client = Together() + +stream = client.chat.completions.create( + model="moonshotai/Kimi-K2.6", + messages=[ + {"role": "user", "content": "Which number is bigger, 9.11 or 9.9? Think carefully."}, + ], + reasoning={"enabled": True}, + temperature=1.0, + top_p=0.95, + stream=True, +) + +for chunk in stream: + delta = chunk.choices[0].delta + if hasattr(delta, "reasoning") and delta.reasoning: + print(delta.reasoning, end="", flush=True) + if hasattr(delta, "content") and delta.content: + print(delta.content, end="", flush=True) +``` + +### TypeScript -- Enable Reasoning + +```typescript +import Together from "together-ai"; +import type { + ChatCompletionChunk, + CompletionCreateParamsStreaming, +} from "together-ai/resources/chat/completions"; + +const together = new Together(); + +type ReasoningParams = CompletionCreateParamsStreaming & { + reasoning?: { enabled: boolean }; +}; + +type ReasoningDelta = ChatCompletionChunk.Choice.Delta & { + reasoning?: string; +}; + +const params: ReasoningParams = { + model: "moonshotai/Kimi-K2.6", + messages: [ + { + role: "user", + content: "Which number is bigger, 9.11 or 9.9? 
Think carefully.", + }, + ], + reasoning: { enabled: true }, + temperature: 1.0, + top_p: 0.95, + stream: true, +}; + +const stream = await together.chat.completions.create(params); + +for await (const chunk of stream) { + const delta = chunk.choices[0]?.delta as ReasoningDelta; + if (delta?.reasoning) process.stdout.write(delta.reasoning); + if (delta?.content) process.stdout.write(delta.content); +} +``` + +### Python -- Disable Reasoning (Instant Mode) + +```python +response = client.chat.completions.create( + model="moonshotai/Kimi-K2.6", + messages=[{"role": "user", "content": "What is the capital of France?"}], + reasoning={"enabled": False}, + temperature=0.6, +) +print(response.choices[0].message.content) +``` + +### Alternative: chat_template_kwargs + +```python +response = client.chat.completions.create( + model="Qwen/Qwen3.5-397B-A17B", + messages=[{"role": "user", "content": "Prove that sqrt(2) is irrational."}], + chat_template_kwargs={"thinking": True}, + stream=True, +) +``` + +## Controlling Reasoning Depth via Prompting + +For hybrid models without `reasoning_effort`, influence thinking depth through the +prompt: + +```python +# Ask for concise reasoning +response = client.chat.completions.create( + model="deepseek-ai/DeepSeek-V4-Pro", + messages=[ + { + "role": "user", + "content": "Please think briefly.\n\nWhat is 15% of 240?", + } + ], + stream=True, +) + +# Or suggest a reasoning budget +response = client.chat.completions.create( + model="deepseek-ai/DeepSeek-V4-Pro", + messages=[ + { + "role": "user", + "content": ( + "Please use around 1000 words to think, " + "but do not literally count each one.\n\n" + "Explain why quicksort has O(n log n) average-case complexity." + ), + } + ], + stream=True, +) +``` + +## Reasoning Output Format + +### Separate reasoning field (most models) + +Models like Kimi K2.6, GLM-5.1, DeepSeek-V4-Pro, GPT-OSS, and Qwen3.5 return reasoning in a dedicated +`reasoning` field on the response message or streaming delta. 
+ +Non-streaming (Python): + +```python +response = client.chat.completions.create( + model="moonshotai/Kimi-K2.6", + messages=[{"role": "user", "content": "Say test 10 times"}], +) +print("Reasoning:", response.choices[0].message.reasoning) +print("Answer:", response.choices[0].message.content) +``` + +Non-streaming (TypeScript): + +```typescript +const response = await together.chat.completions.create({ + model: "moonshotai/Kimi-K2.6", + messages: [{ role: "user", content: "Say test 10 times" }], +} as any); + +console.log("Reasoning:", (response.choices[0].message as any).reasoning); +console.log("Answer:", response.choices[0].message.content); +``` + +Streaming (Python): + +```python +stream = client.chat.completions.create( + model="moonshotai/Kimi-K2.6", + messages=[{"role": "user", "content": "Which number is bigger, 9.11 or 9.9?"}], + stream=True, +) + +for chunk in stream: + if chunk.choices: + delta = chunk.choices[0].delta + if hasattr(delta, "reasoning") and delta.reasoning: + print(delta.reasoning, end="", flush=True) + if hasattr(delta, "content") and delta.content: + print(delta.content, end="", flush=True) +``` + +Streaming (TypeScript): + +```typescript +import type { ChatCompletionChunk } from "together-ai/resources/chat/completions"; + +const stream = await together.chat.completions.stream({ + model: "moonshotai/Kimi-K2.6", + messages: [ + { role: "user", content: "Which number is bigger, 9.11 or 9.9?" 
}, + ], +} as any); + +for await (const chunk of stream) { + const delta = chunk.choices[0]?.delta as ChatCompletionChunk.Choice.Delta & { + reasoning?: string; + }; + if (delta?.reasoning) process.stdout.write(delta.reasoning); + if (delta?.content) process.stdout.write(delta.content); +} +``` + +## Structured Outputs with Reasoning + +Reasoning models support JSON mode for structured output extraction: + +### Python + +```python +import json +from together import Together +from pydantic import BaseModel, Field + +client = Together() + +class Step(BaseModel): + explanation: str + output: str + +class MathReasoning(BaseModel): + steps: list[Step] + final_answer: str + +completion = client.chat.completions.create( + model="deepseek-ai/DeepSeek-V4-Pro", + messages=[ + { + "role": "system", + "content": "You are a helpful math tutor. Guide the user through the solution step by step.", + }, + {"role": "user", "content": "how can I solve 8x + 7 = -23"}, + ], + response_format={ + "type": "json_schema", + "json_schema": { + "name": "math_reasoning", + "schema": MathReasoning.model_json_schema(), + }, + }, +) + +math_reasoning = json.loads(completion.choices[0].message.content) +print(json.dumps(math_reasoning, indent=2)) +``` + +### TypeScript + +```typescript +import Together from "together-ai"; +import { z } from "zod"; + +const together = new Together(); + +const stepSchema = z.object({ + explanation: z.string(), + output: z.string(), +}); + +const mathReasoningSchema = z.object({ + steps: z.array(stepSchema), + final_answer: z.string(), +}); + +const jsonSchema = z.toJSONSchema(mathReasoningSchema); + +const completion = await together.chat.completions.create({ + model: "deepseek-ai/DeepSeek-V4-Pro", + messages: [ + { + role: "system", + content: + "You are a helpful math tutor. 
Guide the user through the solution step by step.", + }, + { role: "user", content: "how can I solve 8x + 7 = -23" }, + ], + response_format: { + type: "json_schema", + json_schema: { + name: "math_reasoning", + schema: jsonSchema, + }, + }, +}); + +if (completion?.choices?.[0]?.message?.content) { + const result = JSON.parse(completion.choices[0].message.content); + console.log(JSON.stringify(result, null, 2)); +} +``` + +## Best Practices by Model + +### DeepSeek-V4-Pro +- Hybrid reasoning model with very long context (512K) +- Toggle reasoning via `reasoning={"enabled": True/False}` +- Strong performance on math, code, and agentic tool use +- Avoid micromanaging reasoning steps -- let the model determine methodology + +### Kimi K2.6 +- Temperature 1.0 for thinking mode, 0.6 for instant mode +- Supports both reasoning and non-reasoning modes +- Excels at multi-turn tool calling with reasoning interleaved + +### GLM-5.1 / GLM-5 +- Thinking is enabled by default +- Supports Preserved Thinking: set `"clear_thinking": false` in `chat_template_kwargs` +- Preserved Thinking retains reasoning across turns for better agentic workflows + +### GPT-OSS +- Use `reasoning_effort` to control depth +- Set `max_tokens` to ~30,000 with `reasoning_effort="high"` +- Build Tier 1+ required + +### Nemotron 3 Ultra 550B A55B +- Hybrid reasoning model with 512K context +- Defaults to high reasoning effort +- To switch to medium effort, pass `chat_template_kwargs={"medium_effort": True}` instead of `reasoning_effort` diff --git a/plugins/togetherai/skills/together-chat-completions/references/structured-outputs.md b/plugins/togetherai/skills/together-chat-completions/references/structured-outputs.md new file mode 100644 index 00000000..79144e82 --- /dev/null +++ b/plugins/togetherai/skills/together-chat-completions/references/structured-outputs.md @@ -0,0 +1,542 @@ +# Structured Outputs Reference +## Contents + +- [Three Modes](#three-modes) +- [Structured Outputs with Reasoning 
Models](#structured-outputs-with-reasoning-models) +- [Streaming Structured Output](#streaming-structured-output) +- [Supported Models](#supported-models) +- [Troubleshooting](#troubleshooting) +- [Prompting Best Practices](#prompting-best-practices) + + +## Three Modes + +### 1. json_schema (Recommended) + +Constrains output to match your JSON schema exactly. Use Pydantic in Python and Zod in TypeScript +to define schemas. + +### Python + +```python +import json +from together import Together +from pydantic import BaseModel, Field + +client = Together() + +class VoiceNote(BaseModel): + title: str = Field(description="A title for the voice note") + summary: str = Field(description="A short one sentence summary of the voice note.") + actionItems: list[str] = Field(description="A list of action items from the voice note") + +transcript = ( + "Good morning! Today is going to be a busy day. First, I need to make a quick breakfast. " + "While cooking, I'll also check my emails to see if there's anything urgent." +) + +extract = client.chat.completions.create( + messages=[ + { + "role": "system", + "content": ( + "The following is a voice message transcript. Only answer in JSON " + f"and follow this schema {json.dumps(VoiceNote.model_json_schema())}." 
+ ), + }, + {"role": "user", "content": transcript}, + ], + model="openai/gpt-oss-20b", + response_format={ + "type": "json_schema", + "json_schema": { + "name": "voice_note", + "schema": VoiceNote.model_json_schema(), + }, + }, +) + +output = json.loads(extract.choices[0].message.content) +print(json.dumps(output, indent=2)) +``` + +Output: +```json +{ + "title": "Morning Routine", + "summary": "Starting the day with a quick breakfast and checking emails", + "actionItems": [ + "Cook scrambled eggs and toast", + "Brew a cup of coffee", + "Check emails for urgent messages" + ] +} +``` + +### TypeScript + +```typescript +import Together from "together-ai"; +import { z } from "zod"; + +const together = new Together(); + +const voiceNoteSchema = z.object({ + title: z.string().describe("A title for the voice note"), + summary: z + .string() + .describe("A short one sentence summary of the voice note."), + actionItems: z + .array(z.string()) + .describe("A list of action items from the voice note"), +}); +const jsonSchema = z.toJSONSchema(voiceNoteSchema); + +async function main() { + const transcript = + "Good morning! Today is going to be a busy day. First, I need to make a quick " + + "breakfast. While cooking, I'll also check my emails to see if there's anything urgent."; + + const extract = await together.chat.completions.create({ + messages: [ + { + role: "system", + content: `The following is a voice message transcript. 
Only answer in JSON and follow this schema ${JSON.stringify(jsonSchema)}.`, + }, + { role: "user", content: transcript }, + ], + model: "openai/gpt-oss-20b", + response_format: { + type: "json_schema", + json_schema: { + name: "voice_note", + schema: jsonSchema, + }, + }, + }); + + if (extract?.choices?.[0]?.message?.content) { + const output = JSON.parse(extract.choices[0].message.content); + console.log(output); + } +} + +main(); +``` + +### cURL + +```shell +curl -X POST "https://api.together.xyz/v1/chat/completions" \ + -H "Authorization: Bearer $TOGETHER_API_KEY" \ + -H "Content-Type: application/json" \ + -d '{ + "messages": [ + { + "role": "system", + "content": "The following is a voice message transcript. Only answer in JSON." + }, + { + "role": "user", + "content": "Good morning! Today is going to be a busy day. First, I need to make a quick breakfast. While cooking, I will also check my emails." + } + ], + "model": "openai/gpt-oss-20b", + "response_format": { + "type": "json_schema", + "schema": { + "properties": { + "title": { "type": "string", "description": "A title for the voice note" }, + "summary": { "type": "string", "description": "A short one sentence summary" }, + "actionItems": { + "items": { "type": "string" }, + "type": "array", + "description": "Action items" + } + }, + "required": ["title", "summary", "actionItems"], + "type": "object" + } + } + }' +``` + +### OpenAI SDK Compatibility + +```python +from pydantic import BaseModel +from openai import OpenAI +import os, json + +client = OpenAI( + api_key=os.environ.get("TOGETHER_API_KEY"), + base_url="https://api.together.xyz/v1", +) + +class CalendarEvent(BaseModel): + name: str + date: str + participants: list[str] + +completion = client.chat.completions.create( + model="openai/gpt-oss-20b", + messages=[ + {"role": "system", "content": "Extract the event information."}, + {"role": "user", "content": "Alice and Bob are going to a science fair on Friday. 
Answer in JSON"}, + ], + response_format={ + "type": "json_schema", + "json_schema": { + "name": "calendar_event", + "schema": CalendarEvent.model_json_schema(), + }, + }, +) + +output = json.loads(completion.choices[0].message.content) +print(json.dumps(output, indent=2)) +``` + +### 2. json_object (Simple) + +Model outputs valid JSON but structure is guided by prompt only. + +### Python + +```python +from together import Together + +client = Together() + +response = client.chat.completions.create( + model="openai/gpt-oss-20b", + messages=[ + {"role": "system", "content": "Respond in JSON with keys: name, age, city"}, + {"role": "user", "content": "Tell me about yourself"}, + ], + response_format={"type": "json_object"}, +) +print(response.choices[0].message.content) +``` + +### TypeScript + +```typescript +import Together from "together-ai"; +const together = new Together(); + +const response = await together.chat.completions.create({ + model: "openai/gpt-oss-20b", + messages: [ + { role: "system", content: "Respond in JSON with keys: name, age, city" }, + { role: "user", content: "Tell me about yourself" }, + ], + response_format: { type: "json_object" }, +}); +console.log(response.choices[0].message.content); +``` + +### cURL + +```shell +curl -X POST "https://api.together.xyz/v1/chat/completions" \ + -H "Authorization: Bearer $TOGETHER_API_KEY" \ + -H "Content-Type: application/json" \ + -d '{ + "model": "openai/gpt-oss-20b", + "messages": [ + {"role": "system", "content": "Respond in JSON with keys: name, age, city"}, + {"role": "user", "content": "Tell me about yourself"} + ], + "response_format": {"type": "json_object"} + }' +``` + +### 3. regex (Pattern Matching) + +Constrains output to match a regex pattern. All models supported for JSON mode also support regex. 
+ +### Python + +```python +from together import Together + +client = Together() + +# Sentiment classification +response = client.chat.completions.create( + model="meta-llama/Llama-3.3-70B-Instruct-Turbo", + temperature=0.2, + max_tokens=10, + messages=[ + { + "role": "system", + "content": "Classify the sentiment of the text as positive, neutral, or negative.", + }, + {"role": "user", "content": "Wow. I loved the movie!"}, + ], + response_format={"type": "regex", "pattern": "(positive|neutral|negative)"}, +) +print(response.choices[0].message.content) # "positive" + +# Phone number pattern +response = client.chat.completions.create( + model="meta-llama/Llama-3.3-70B-Instruct-Turbo", + messages=[{"role": "user", "content": "Generate a US phone number for a pizza shop"}], + response_format={"type": "regex", "pattern": r"\(\d{3}\) \d{3}-\d{4}"}, +) +``` + +### TypeScript + +```typescript +import Together from "together-ai"; +const together = new Together(); + +const completion = await together.chat.completions.create({ + model: "meta-llama/Llama-3.3-70B-Instruct-Turbo", + temperature: 0.2, + max_tokens: 10, + messages: [ + { + role: "system", + content: + "Classify the sentiment of the text as positive, neutral, or negative.", + }, + { role: "user", content: "Wow. I loved the movie!" }, + ], + response_format: { + type: "regex", + // @ts-ignore + pattern: "(positive|neutral|negative)", + }, +}); + +console.log(completion?.choices[0]?.message?.content); // "positive" +``` + +### cURL + +```shell +curl -X POST "https://api.together.xyz/v1/chat/completions" \ + -H "Authorization: Bearer $TOGETHER_API_KEY" \ + -H "Content-Type: application/json" \ + -d '{ + "model": "meta-llama/Llama-3.3-70B-Instruct-Turbo", + "temperature": 0.2, + "max_tokens": 10, + "messages": [ + { + "role": "system", + "content": "Classify the sentiment of the text as positive, neutral, or negative." + }, + {"role": "user", "content": "Wow. 
I loved the movie!"} + ], + "response_format": {"type": "regex", "pattern": "(positive|neutral|negative)"} + }' +``` + +## Structured Outputs with Reasoning Models + +Some reasoning models support JSON mode. The model reasons internally then produces structured JSON. + +### Python + +```python +import json +from together import Together +from pydantic import BaseModel, Field + +client = Together() + +class Step(BaseModel): + explanation: str + output: str + +class MathReasoning(BaseModel): + steps: list[Step] + final_answer: str + +completion = client.chat.completions.create( + model="deepseek-ai/DeepSeek-V4-Pro", + messages=[ + { + "role": "system", + "content": "You are a helpful math tutor. Guide the user through the solution step by step.", + }, + {"role": "user", "content": "how can I solve 8x + 7 = -23"}, + ], + response_format={ + "type": "json_schema", + "json_schema": { + "name": "math_reasoning", + "schema": MathReasoning.model_json_schema(), + }, + }, +) + +math_reasoning = json.loads(completion.choices[0].message.content) +print(json.dumps(math_reasoning, indent=2)) +``` + +### TypeScript + +```typescript +import Together from "together-ai"; +import { z } from "zod"; + +const together = new Together(); + +const stepSchema = z.object({ + explanation: z.string(), + output: z.string(), +}); + +const mathReasoningSchema = z.object({ + steps: z.array(stepSchema), + final_answer: z.string(), +}); + +const jsonSchema = z.toJSONSchema(mathReasoningSchema); + +const completion = await together.chat.completions.create({ + model: "deepseek-ai/DeepSeek-V4-Pro", + messages: [ + { + role: "system", + content: + "You are a helpful math tutor. 
Guide the user through the solution step by step.", + }, + { role: "user", content: "how can I solve 8x + 7 = -23" }, + ], + response_format: { + type: "json_schema", + json_schema: { + name: "math_reasoning", + schema: jsonSchema, + }, + }, +}); + +if (completion?.choices?.[0]?.message?.content) { + const result = JSON.parse(completion.choices[0].message.content); + console.log(JSON.stringify(result, null, 2)); +} +``` + +## Streaming Structured Output + +You can combine `response_format` with `stream=True`. Tokens arrive incrementally (individual chunks +are not valid JSON), so accumulate all chunks and parse the final concatenated string. + +```python +import json +from together import Together +from pydantic import BaseModel, Field + +client = Together() + +class Summary(BaseModel): + title: str = Field(description="A short title") + bullets: list[str] = Field(description="Key points") + +schema = Summary.model_json_schema() + +stream = client.chat.completions.create( + model="openai/gpt-oss-20b", + messages=[ + {"role": "system", "content": f"Respond in JSON matching: {json.dumps(schema)}"}, + {"role": "user", "content": "Summarize the benefits of exercise"}, + ], + response_format={ + "type": "json_schema", + "json_schema": {"name": "summary", "schema": schema}, + }, + stream=True, +) + +chunks: list[str] = [] +for chunk in stream: + token = chunk.choices[0].delta.content or "" + chunks.append(token) + print(token, end="", flush=True) +print() + +result = json.loads("".join(chunks)) +print(f"Title: {result['title']}") +print(f"Bullets: {result['bullets']}") +``` + +```typescript +import Together from "together-ai"; +import { z } from "zod"; + +const together = new Together(); + +const summarySchema = z.object({ + title: z.string().describe("A short title"), + bullets: z.array(z.string()).describe("Key points"), +}); +const jsonSchema = z.toJSONSchema(summarySchema); + +const stream = await together.chat.completions.create({ + model: "openai/gpt-oss-20b", + 
messages: [ + { role: "system", content: `Respond in JSON matching: ${JSON.stringify(jsonSchema)}` }, + { role: "user", content: "Summarize the benefits of exercise" }, + ], + response_format: { + type: "json_schema", + json_schema: { name: "summary", schema: jsonSchema }, + }, + stream: true, +}); + +const chunks: string[] = []; +for await (const chunk of stream) { + const token = chunk.choices[0]?.delta?.content || ""; + chunks.push(token); + process.stdout.write(token); +} + +const result = JSON.parse(chunks.join("")); +console.log(`Title: ${result.title}`); +``` + +## Supported Models + +### Top Models (json_schema, json_object, regex) +- `openai/gpt-oss-120b` +- `openai/gpt-oss-20b` +- `moonshotai/Kimi-K2.6` +- `zai-org/GLM-5.1` +- `zai-org/GLM-5` +- `MiniMaxAI/MiniMax-M2.7` +- `Qwen/Qwen3.5-397B-A17B` +- `Qwen/Qwen3.6-Plus` +- `deepseek-ai/DeepSeek-V4-Pro` + +### Additional Supported Models +- `meta-llama/Llama-3.3-70B-Instruct-Turbo` +- `Qwen/Qwen2.5-7B-Instruct-Turbo` +- `Qwen/Qwen3.5-9B` +- `google/gemma-4-31B-it` +- `google/gemma-3n-E4B-it` + +## Troubleshooting + +- Token limits: Check the max token limit of your model. Truncated output is a common issue. +- Malformed JSON: Validate your example JSON before using it in prompts. The model follows your + example exactly, including syntax errors. +- Common symptoms: Unterminated strings, repeated newlines, incomplete structures, or truncated + output with `stop` finish reason. + +## Prompting Best Practices + +1. Always tell the model to respond only in JSON in the system prompt +2. Include a plain-text copy of the schema in the prompt +3. Use `json_schema` mode when you need guaranteed structure +4. Use `json_object` for simpler cases where prompt guidance is sufficient +5. Use `regex` mode for simple constrained outputs (classification, IDs, phone numbers) +6. Works with vision models (e.g., `Qwen/Qwen3.5-397B-A17B`) +7. 
Works with reasoning models (e.g., `deepseek-ai/DeepSeek-V4-Pro`) diff --git a/plugins/togetherai/skills/together-chat-completions/scripts/async_parallel.py b/plugins/togetherai/skills/together-chat-completions/scripts/async_parallel.py new file mode 100644 index 00000000..36133d76 --- /dev/null +++ b/plugins/togetherai/skills/together-chat-completions/scripts/async_parallel.py @@ -0,0 +1,50 @@ +#!/usr/bin/env python3 +""" +Together AI Chat Completions - Async Parallel Requests (v2 SDK) + +Demonstrates using AsyncTogether to run multiple independent +chat completion requests in parallel. + +Usage: + python async_parallel.py + +Requires: + uv pip install "together>=2.0.0" + export TOGETHER_API_KEY=your_key +""" + +import asyncio +from together import AsyncTogether + + +async def main() -> None: + client = AsyncTogether() + + prompts = [ + "What is the capital of France?", + "Write a haiku about the ocean", + "What is 42 * 37?", + "Name three programming languages created in the 1990s", + ] + + print(f"Sending {len(prompts)} requests in parallel...\n") + + tasks = [ + client.chat.completions.create( + model="openai/gpt-oss-20b", + messages=[{"role": "user", "content": prompt}], + max_tokens=150, + ) + for prompt in prompts + ] + + responses = await asyncio.gather(*tasks) + + for prompt, response in zip(prompts, responses): + answer = response.choices[0].message.content.strip() + print(f"Q: {prompt}") + print(f"A: {answer}\n") + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/plugins/togetherai/skills/together-chat-completions/scripts/chat_basic.py b/plugins/togetherai/skills/together-chat-completions/scripts/chat_basic.py new file mode 100644 index 00000000..12807a76 --- /dev/null +++ b/plugins/togetherai/skills/together-chat-completions/scripts/chat_basic.py @@ -0,0 +1,76 @@ +#!/usr/bin/env python3 +""" +Together AI Chat Completions - Basic Chat and Streaming (v2 SDK) + +Demonstrates single-query chat, streaming, and multi-turn conversation. 
+ +Usage: + python chat_basic.py + +Requires: + uv pip install "together>=2.0.0" + export TOGETHER_API_KEY=your_key +""" + +from together import Together + +client = Together() + + +def basic_chat() -> None: + """Send a single chat completion request.""" + print("=== Basic Chat ===") + response = client.chat.completions.create( + model="openai/gpt-oss-20b", + messages=[{"role": "user", "content": "What are some fun things to do in NYC?"}], + ) + print(response.choices[0].message.content) + print() + + +def streaming_chat() -> None: + """Stream tokens incrementally.""" + print("=== Streaming ===") + stream = client.chat.completions.create( + model="openai/gpt-oss-20b", + messages=[{"role": "user", "content": "Write a haiku about coding"}], + stream=True, + ) + for chunk in stream: + if chunk.choices: + print(chunk.choices[0].delta.content or "", end="", flush=True) + print("\n") + + +def multi_turn_chat() -> None: + """Multi-turn conversation with system prompt.""" + print("=== Multi-Turn ===") + messages = [ + {"role": "system", "content": "You are a helpful travel guide. 
Keep answers brief."}, + {"role": "user", "content": "What should I do in Paris?"}, + ] + + response = client.chat.completions.create( + model="openai/gpt-oss-20b", + messages=messages, + ) + assistant_reply = response.choices[0].message.content + print(f"User: What should I do in Paris?") + print(f"Assistant: {assistant_reply}\n") + + # Continue the conversation + messages.append({"role": "assistant", "content": assistant_reply}) + messages.append({"role": "user", "content": "How about food recommendations?"}) + + response = client.chat.completions.create( + model="openai/gpt-oss-20b", + messages=messages, + ) + print(f"User: How about food recommendations?") + print(f"Assistant: {response.choices[0].message.content}") + + +if __name__ == "__main__": + basic_chat() + streaming_chat() + multi_turn_chat() diff --git a/plugins/togetherai/skills/together-chat-completions/scripts/chat_basic.ts b/plugins/togetherai/skills/together-chat-completions/scripts/chat_basic.ts new file mode 100644 index 00000000..54d6bc0f --- /dev/null +++ b/plugins/togetherai/skills/together-chat-completions/scripts/chat_basic.ts @@ -0,0 +1,77 @@ +#!/usr/bin/env -S npx tsx +/** + * Together AI Chat Completions - Basic Chat and Streaming + * + * Demonstrates single-query chat, streaming, and multi-turn conversation. + * + * Usage: + * npx tsx chat_basic.ts + * + * Requires: + * npm install together-ai + * export TOGETHER_API_KEY=your_key + */ + +import Together from "together-ai"; + +const client = new Together({ + apiKey: process.env.TOGETHER_API_KEY, +}); + +async function basicChat(): Promise<void> { + console.log("=== Basic Chat ==="); + const response = await client.chat.completions.create({ + model: "openai/gpt-oss-20b", + messages: [{ role: "user", content: "What are some fun things to do in NYC?" 
}], + }); + console.log(response.choices[0].message.content); + console.log(); +} + +async function streamingChat(): Promise<void> { + console.log("=== Streaming ==="); + const stream = await client.chat.completions.create({ + model: "openai/gpt-oss-20b", + messages: [{ role: "user", content: "Write a haiku about coding" }], + stream: true, + }); + + for await (const chunk of stream) { + process.stdout.write(chunk.choices[0]?.delta?.content || ""); + } + console.log("\n"); +} + +async function multiTurnChat(): Promise<void> { + console.log("=== Multi-Turn ==="); + const messages: { role: string; content: string }[] = [ + { role: "system", content: "You are a helpful travel guide. Keep answers brief." }, + { role: "user", content: "What should I do in Paris?" }, + ]; + + const response = await client.chat.completions.create({ + model: "openai/gpt-oss-20b", + messages: messages as any, + }); + const assistantReply = response.choices[0].message.content ?? ""; + console.log("User: What should I do in Paris?"); + console.log(`Assistant: ${assistantReply}\n`); + + messages.push({ role: "assistant", content: assistantReply }); + messages.push({ role: "user", content: "How about food recommendations?" 
}); + + const response2 = await client.chat.completions.create({ + model: "openai/gpt-oss-20b", + messages: messages as any, + }); + console.log("User: How about food recommendations?"); + console.log(`Assistant: ${response2.choices[0].message.content}`); +} + +async function main(): Promise<void> { + await basicChat(); + await streamingChat(); + await multiTurnChat(); +} + +main(); diff --git a/plugins/togetherai/skills/together-chat-completions/scripts/debug_headers.py b/plugins/togetherai/skills/together-chat-completions/scripts/debug_headers.py new file mode 100644 index 00000000..1b3c21d6 --- /dev/null +++ b/plugins/togetherai/skills/together-chat-completions/scripts/debug_headers.py @@ -0,0 +1,57 @@ +#!/usr/bin/env python3 +""" +Together AI Chat Completions - Debug Headers and Raw Responses (v2 SDK) + +Inspect parsed chat output together with raw response headers for latency, routing, +and rate-limit debugging. + +Usage: + python debug_headers.py + +Requires: + uv pip install "together>=2.0.0" + export TOGETHER_API_KEY=your_key +""" + +from together import Together + +client = Together() + + +def main() -> None: + """Print a parsed chat response together with selected response headers.""" + response = client.chat.completions.with_raw_response.create( + model="openai/gpt-oss-20b", + messages=[{"role": "user", "content": "Say hello in one sentence."}], + extra_headers={"x-together-debug": "1"}, + ) + + parsed = response.parse() + print("=== Parsed Response ===") + print(parsed.choices[0].message.content) + print() + + print("=== Selected Headers ===") + interesting_headers = [ + "x-request-id", + "x-together-traceid", + "x-cluster", + "x-engine-pod", + "x-api-received", + "x-api-call-start", + "x-api-call-end", + "x-inference-version", + "x-ratelimit-limit", + "x-ratelimit-remaining", + "x-ratelimit-reset", + "x-tokenlimit-limit", + "x-tokenlimit-remaining", + ] + header_map = dict(response.headers) + for key in interesting_headers: + if key in header_map: + 
print(f"{key}: {header_map[key]}") + + +if __name__ == "__main__": + main() diff --git a/plugins/togetherai/skills/together-chat-completions/scripts/debug_headers.ts b/plugins/togetherai/skills/together-chat-completions/scripts/debug_headers.ts new file mode 100644 index 00000000..8dce45bb --- /dev/null +++ b/plugins/togetherai/skills/together-chat-completions/scripts/debug_headers.ts @@ -0,0 +1,64 @@ +#!/usr/bin/env -S npx tsx +/** + * Together AI Chat Completions - Debug Headers and Raw Responses + * + * Inspect parsed chat output together with raw response headers for latency, + * routing, and rate-limit debugging. + * + * Usage: + * npx tsx debug_headers.ts + * + * Requires: + * npm install together-ai + * export TOGETHER_API_KEY=your_key + */ + +import Together from "together-ai"; + +const client = new Together({ + apiKey: process.env.TOGETHER_API_KEY, +}); + +async function main(): Promise<void> { + const response = await client.chat.completions.create( + { + model: "openai/gpt-oss-20b", + messages: [{ role: "user", content: "Say hello in one sentence." 
}], + }, + { headers: { "x-together-debug": "1" } } + ).asResponse(); + + const parsed = await response.json(); + console.log("=== Parsed Response ==="); + console.log(parsed.choices[0].message.content); + console.log(); + + console.log("=== Selected Headers ==="); + const interestingHeaders = [ + "x-request-id", + "x-together-traceid", + "x-cluster", + "x-engine-pod", + "x-api-received", + "x-api-call-start", + "x-api-call-end", + "x-inference-version", + "x-ratelimit-limit", + "x-ratelimit-remaining", + "x-ratelimit-reset", + "x-tokenlimit-limit", + "x-tokenlimit-remaining", + ]; + + for (const key of interestingHeaders) { + const value = response.headers.get(key); + if (value) { + console.log(`${key}: ${value}`); + } + } +} + +main().catch((error) => { + console.error(error); + process.exit(1); +}); diff --git a/plugins/togetherai/skills/together-chat-completions/scripts/reasoning_models.py b/plugins/togetherai/skills/together-chat-completions/scripts/reasoning_models.py new file mode 100644 index 00000000..223c807d --- /dev/null +++ b/plugins/togetherai/skills/together-chat-completions/scripts/reasoning_models.py @@ -0,0 +1,150 @@ +#!/usr/bin/env python3 +""" +Together AI Chat Completions - Reasoning Models (v2 SDK) + +Demonstrates reasoning with separate reasoning field, DeepSeek R1 <think> tags, +reasoning effort control, and enabling/disabling reasoning on hybrid models. 
+ +Usage: + python reasoning_models.py + +Requires: + uv pip install "together>=2.0.0" + export TOGETHER_API_KEY=your_key +""" + +import re +from together import Together + +client = Together() + + +def reasoning_field_streaming() -> None: + """Most reasoning models return a separate `reasoning` field.""" + print("=== Reasoning Field (Kimi K2.6 streaming) ===") + stream = client.chat.completions.create( + model="moonshotai/Kimi-K2.6", + messages=[ + {"role": "user", "content": "Which number is bigger, 9.11 or 9.9?"}, + ], + stream=True, + ) + + reasoning_text = "" + content_text = "" + for chunk in stream: + if chunk.choices: + delta = chunk.choices[0].delta + if hasattr(delta, "reasoning") and delta.reasoning: + reasoning_text += delta.reasoning + if hasattr(delta, "content") and delta.content: + content_text += delta.content + + print(f"Reasoning: {reasoning_text[:200]}...") + print(f"Answer: {content_text}") + print() + + +def reasoning_field_non_streaming() -> None: + """Non-streaming access to reasoning field.""" + print("=== Reasoning Field (non-streaming) ===") + response = client.chat.completions.create( + model="moonshotai/Kimi-K2.6", + messages=[{"role": "user", "content": "What is 15% of 240?"}], + ) + print(f"Reasoning: {response.choices[0].message.reasoning[:200]}...") + print(f"Answer: {response.choices[0].message.content}") + print() + + +def deepseek_r1_think_tags() -> None: + """DeepSeek R1 outputs reasoning in <think> tags within content.""" + print("=== DeepSeek R1 (<think> tags) ===") + stream = client.chat.completions.create( + model="deepseek-ai/DeepSeek-V4-Pro", + messages=[ + {"role": "user", "content": "Which number is bigger 9.9 or 9.11?"}, + ], + stream=True, + ) + + full_content = "" + for chunk in stream: + content = chunk.choices[0].delta.content or "" + full_content += content + + # Parse <think> tags + think_match = re.search(r"<think>(.*?)</think>", full_content, re.DOTALL) + thinking = think_match.group(1).strip() if think_match else "" + answer = re.sub(r"<think>.*?</think>", "", 
full_content, flags=re.DOTALL).strip() + + print(f"Thinking: {thinking[:200]}...") + print(f"Answer: {answer}") + print() + + +def reasoning_effort_example() -> None: + """Control reasoning depth with reasoning_effort (GPT-OSS).""" + print("=== Reasoning Effort (GPT-OSS) ===") + for effort in ["low", "medium", "high"]: + stream = client.chat.completions.create( + model="openai/gpt-oss-20b", + messages=[{"role": "user", "content": "Is 17 a prime number?"}], + temperature=1.0, + top_p=1.0, + reasoning_effort=effort, + stream=True, + ) + + content = "" + for chunk in stream: + content += chunk.choices[0].delta.content or "" + + print(f" effort={effort}: {content[:100]}...") + print() + + +def toggle_reasoning() -> None: + """Enable/disable reasoning on hybrid models.""" + print("=== Toggle Reasoning (Kimi K2.6) ===") + + # Reasoning enabled (thinking mode) + print(" [reasoning=True]") + stream = client.chat.completions.create( + model="moonshotai/Kimi-K2.6", + messages=[{"role": "user", "content": "What is the capital of France?"}], + reasoning={"enabled": True}, + temperature=1.0, + stream=True, + ) + + reasoning_text = "" + content_text = "" + for chunk in stream: + if chunk.choices: + delta = chunk.choices[0].delta + if hasattr(delta, "reasoning") and delta.reasoning: + reasoning_text += delta.reasoning + if hasattr(delta, "content") and delta.content: + content_text += delta.content + + print(f" Reasoning tokens: {len(reasoning_text)} chars") + print(f" Answer: {content_text[:100]}") + + # Reasoning disabled (instant mode) + print(" [reasoning=False]") + response = client.chat.completions.create( + model="moonshotai/Kimi-K2.6", + messages=[{"role": "user", "content": "What is the capital of France?"}], + reasoning={"enabled": False}, + temperature=0.6, + ) + print(f" Answer: {response.choices[0].message.content[:100]}") + + +if __name__ == "__main__": + reasoning_field_streaming() + reasoning_field_non_streaming() + deepseek_r1_think_tags() + 
reasoning_effort_example() + toggle_reasoning() diff --git a/plugins/togetherai/skills/together-chat-completions/scripts/reasoning_models.ts b/plugins/togetherai/skills/together-chat-completions/scripts/reasoning_models.ts new file mode 100644 index 00000000..db391763 --- /dev/null +++ b/plugins/togetherai/skills/together-chat-completions/scripts/reasoning_models.ts @@ -0,0 +1,162 @@ +#!/usr/bin/env -S npx tsx +/** + * Together AI Chat Completions - Reasoning Models + * + * Demonstrates reasoning with separate reasoning field, DeepSeek R1 <think> tags, + * reasoning effort control, and enabling/disabling reasoning on hybrid models. + * + * Usage: + * npx tsx reasoning_models.ts + * + * Requires: + * npm install together-ai + * export TOGETHER_API_KEY=your_key + */ + +import Together from "together-ai"; +import type { + ChatCompletionChunk, + ChatCompletionMessageParam, + CompletionCreateParamsStreaming, +} from "together-ai/resources/chat/completions"; + +const client = new Together({ + apiKey: process.env.TOGETHER_API_KEY, +}); + +type ReasoningDelta = ChatCompletionChunk.Choice.Delta & { + reasoning?: string; +}; + +type ReasoningParams = CompletionCreateParamsStreaming & { + reasoning?: { enabled: boolean }; +}; + +// --- 1. Reasoning field (streaming) --- +async function reasoningFieldStreaming(): Promise<void> { + console.log("=== Reasoning Field (Kimi K2.6 streaming) ==="); + + const stream = await client.chat.completions.stream({ + model: "moonshotai/Kimi-K2.6", + messages: [ + { role: "user", content: "Which number is bigger, 9.11 or 9.9?" }, + ], + }); + + let reasoningText = ""; + let contentText = ""; + + for await (const chunk of stream) { + const delta = chunk.choices[0]?.delta as ReasoningDelta; + if (delta?.reasoning) reasoningText += delta.reasoning; + if (delta?.content) contentText += delta.content; + } + + console.log(`Reasoning: ${reasoningText.slice(0, 200)}...`); + console.log(`Answer: ${contentText}`); + console.log(); +} + +// --- 2. 
DeepSeek R1 (<think> tags) --- +async function deepseekR1ThinkTags(): Promise<void> { + console.log("=== DeepSeek R1 (<think> tags) ==="); + + const stream = await client.chat.completions.create({ + model: "deepseek-ai/DeepSeek-V4-Pro", + messages: [ + { role: "user", content: "Which number is bigger 9.9 or 9.11?" }, + ], + stream: true, + }); + + let fullContent = ""; + for await (const chunk of stream) { + fullContent += chunk.choices[0]?.delta?.content || ""; + } + + // Parse <think> tags + const thinkMatch = fullContent.match(/<think>([\s\S]*?)<\/think>/); + const thinking = thinkMatch ? thinkMatch[1].trim() : ""; + const answer = fullContent.replace(/<think>[\s\S]*?<\/think>/, "").trim(); + + console.log(`Thinking: ${thinking.slice(0, 200)}...`); + console.log(`Answer: ${answer}`); + console.log(); +} + +// --- 3. Reasoning effort (GPT-OSS) --- +async function reasoningEffortExample(): Promise<void> { + console.log("=== Reasoning Effort (GPT-OSS) ==="); + + for (const effort of ["low", "medium", "high"] as const) { + const stream = await client.chat.completions.create({ + model: "openai/gpt-oss-20b", + messages: [{ role: "user", content: "Is 17 a prime number?" }], + temperature: 1.0, + top_p: 1.0, + reasoning_effort: effort, + stream: true, + }); + + let content = ""; + for await (const chunk of stream) { + content += chunk.choices[0]?.delta?.content || ""; + } + + console.log(` effort=${effort}: ${content.slice(0, 100)}...`); + } + console.log(); +} + +// --- 4. Toggle reasoning on hybrid models --- +async function toggleReasoning(): Promise<void> { + console.log("=== Toggle Reasoning (Kimi K2.6) ==="); + + // Reasoning enabled (thinking mode) + console.log(" [reasoning=true]"); + const enabledParams: ReasoningParams = { + model: "moonshotai/Kimi-K2.6", + messages: [ + { role: "user", content: "What is the capital of France?" 
}, + ], + reasoning: { enabled: true }, + temperature: 1.0, + stream: true, + }; + + const stream = await client.chat.completions.create(enabledParams); + + let reasoningText = ""; + let contentText = ""; + for await (const chunk of stream) { + const delta = chunk.choices[0]?.delta as ReasoningDelta; + if (delta?.reasoning) reasoningText += delta.reasoning; + if (delta?.content) contentText += delta.content; + } + + console.log(` Reasoning tokens: ${reasoningText.length} chars`); + console.log(` Answer: ${contentText.slice(0, 100)}`); + + // Reasoning disabled (instant mode) + console.log(" [reasoning=false]"); + const disabledParams = { + model: "moonshotai/Kimi-K2.6", + messages: [ + { role: "user", content: "What is the capital of France?" }, + ] as ChatCompletionMessageParam[], + reasoning: { enabled: false }, + temperature: 0.6, + }; + + const response = await client.chat.completions.create(disabledParams); + console.log(` Answer: ${response.choices[0].message.content?.slice(0, 100)}`); +} + +async function main(): Promise<void> { + await reasoningFieldStreaming(); + await deepseekR1ThinkTags(); + await reasoningEffortExample(); + await toggleReasoning(); +} + +main(); diff --git a/plugins/togetherai/skills/together-chat-completions/scripts/structured_outputs.py b/plugins/togetherai/skills/together-chat-completions/scripts/structured_outputs.py new file mode 100644 index 00000000..19676e62 --- /dev/null +++ b/plugins/togetherai/skills/together-chat-completions/scripts/structured_outputs.py @@ -0,0 +1,142 @@ +#!/usr/bin/env python3 +""" +Together AI Chat Completions - Structured Outputs (v2 SDK) + +Demonstrates json_schema, json_object, and regex response formats. + +Usage: + python structured_outputs.py + +Requires: + uv pip install "together>=2.0.0" pydantic + export TOGETHER_API_KEY=your_key +""" + +import json +from together import Together +from pydantic import BaseModel, Field + +client = Together() + + +# --- 1. 
# --- Shared helper ---
def _print_json_content(content) -> None:
    """Pretty-print the JSON text of a chat completion message.

    Args:
        content: The ``message.content`` value of the first choice; may be
            ``None`` or empty when the model returned no text.

    ``json.loads`` raises ``TypeError`` for ``None``, so a response without
    text is reported explicitly instead of crashing the example run.
    """
    if not content:
        print("(model returned no content)")
        return
    print(json.dumps(json.loads(content), indent=2))


# --- 1. json_schema with Pydantic ---
class VoiceNote(BaseModel):
    """Target schema for the voice-note extraction example."""

    title: str = Field(description="A title for the voice note")
    summary: str = Field(description="A short one sentence summary of the voice note.")
    actionItems: list[str] = Field(description="A list of action items from the voice note")


def json_schema_example() -> None:
    """Constrain output to match a Pydantic schema exactly."""
    print("=== json_schema (Pydantic) ===")
    # Generate the schema once: it is embedded in the system prompt and also
    # passed as the response_format constraint.
    schema = VoiceNote.model_json_schema()
    transcript = (
        "Good morning! Today is going to be a busy day. First, I need to make a quick "
        "breakfast. While cooking, I'll also check my emails to see if there's anything urgent. "
        "Then I have a meeting at 10am to discuss the Q4 roadmap."
    )

    extract = client.chat.completions.create(
        messages=[
            {
                "role": "system",
                "content": (
                    "The following is a voice message transcript. Only answer in JSON "
                    f"and follow this schema {json.dumps(schema)}."
                ),
            },
            {"role": "user", "content": transcript},
        ],
        model="openai/gpt-oss-20b",
        response_format={
            "type": "json_schema",
            "json_schema": {
                "name": "voice_note",
                "schema": schema,
            },
        },
    )

    _print_json_content(extract.choices[0].message.content)
    print()


# --- 2. json_object (simple) ---
def json_object_example() -> None:
    """Model outputs valid JSON, structure guided by prompt only."""
    print("=== json_object (simple) ===")
    response = client.chat.completions.create(
        model="openai/gpt-oss-20b",
        messages=[
            {"role": "system", "content": "Respond in JSON with keys: name, age, city, hobby"},
            {"role": "user", "content": "Make up a character for a story"},
        ],
        response_format={"type": "json_object"},
    )
    _print_json_content(response.choices[0].message.content)
    print()


# --- 3. regex (pattern matching) ---
def regex_example() -> None:
    """Constrain output to match a regex pattern."""
    print("=== regex (classification) ===")
    response = client.chat.completions.create(
        model="meta-llama/Llama-3.3-70B-Instruct-Turbo",
        temperature=0.2,
        max_tokens=10,
        messages=[
            {
                "role": "system",
                "content": "Classify the sentiment of the text as positive, neutral, or negative.",
            },
            {"role": "user", "content": "The food was absolutely amazing, best meal I've ever had!"},
        ],
        response_format={"type": "regex", "pattern": "(positive|neutral|negative)"},
    )
    # The regex output is plain text, not JSON, so it is printed as-is.
    print(f"Sentiment: {response.choices[0].message.content}")
    print()


# --- 4. json_schema with reasoning model ---
class Step(BaseModel):
    """One step of a worked solution."""

    explanation: str
    output: str


class MathReasoning(BaseModel):
    """Structured answer: ordered steps followed by the final answer."""

    steps: list[Step]
    final_answer: str


def reasoning_json_example() -> None:
    """Extract structured JSON from a reasoning model."""
    print("=== json_schema + reasoning model ===")
    completion = client.chat.completions.create(
        model="deepseek-ai/DeepSeek-V4-Pro",
        messages=[
            {
                "role": "system",
                "content": "You are a helpful math tutor. Guide the user through the solution step by step.",
            },
            {"role": "user", "content": "how can I solve 8x + 7 = -23"},
        ],
        response_format={
            "type": "json_schema",
            "json_schema": {
                "name": "math_reasoning",
                "schema": MathReasoning.model_json_schema(),
            },
        },
    )

    _print_json_content(completion.choices[0].message.content)
Guide the user through the solution step by step.", + }, + {"role": "user", "content": "how can I solve 8x + 7 = -23"}, + ], + response_format={ + "type": "json_schema", + "json_schema": { + "name": "math_reasoning", + "schema": MathReasoning.model_json_schema(), + }, + }, + ) + + result = json.loads(completion.choices[0].message.content) + print(json.dumps(result, indent=2)) + + +if __name__ == "__main__": + json_schema_example() + json_object_example() + regex_example() + reasoning_json_example() diff --git a/plugins/togetherai/skills/together-chat-completions/scripts/structured_outputs.ts b/plugins/togetherai/skills/together-chat-completions/scripts/structured_outputs.ts new file mode 100644 index 00000000..e3de8ce6 --- /dev/null +++ b/plugins/togetherai/skills/together-chat-completions/scripts/structured_outputs.ts @@ -0,0 +1,159 @@ +#!/usr/bin/env -S npx tsx +/** + * Together AI Chat Completions - Structured Outputs + * + * Demonstrates json_schema (with Zod), json_object, and regex response formats. + * + * Usage: + * npx tsx structured_outputs.ts + * + * Requires: + * npm install together-ai zod + * export TOGETHER_API_KEY=your_key + */ + +import Together from "together-ai"; +import { z } from "zod"; + +const client = new Together({ + apiKey: process.env.TOGETHER_API_KEY, +}); + +// --- 1. json_schema with Zod --- +async function jsonSchemaExample(): Promise { + console.log("=== json_schema (Zod) ==="); + + const voiceNoteSchema = z.object({ + title: z.string().describe("A title for the voice note"), + summary: z.string().describe("A short one sentence summary of the voice note."), + actionItems: z.array(z.string()).describe("A list of action items from the voice note"), + }); + const jsonSchema = z.toJSONSchema(voiceNoteSchema); + + const transcript = + "Good morning! Today is going to be a busy day. First, I need to make a quick " + + "breakfast. While cooking, I'll also check my emails to see if there's anything " + + "urgent. 
// --- 1. json_schema with Zod ---
/**
 * Extract a structured voice note, constraining the output with a JSON
 * schema generated from a Zod object.
 *
 * The schema is embedded in the system prompt and also passed as the
 * `response_format` constraint. The parsed JSON is printed only when the
 * response carries content.
 */
async function jsonSchemaExample(): Promise<void> {
  console.log("=== json_schema (Zod) ===");

  const voiceNoteSchema = z.object({
    title: z.string().describe("A title for the voice note"),
    summary: z.string().describe("A short one sentence summary of the voice note."),
    actionItems: z.array(z.string()).describe("A list of action items from the voice note"),
  });
  const jsonSchema = z.toJSONSchema(voiceNoteSchema);

  const transcript =
    "Good morning! Today is going to be a busy day. First, I need to make a quick " +
    "breakfast. While cooking, I'll also check my emails to see if there's anything " +
    "urgent. Then I have a meeting at 10am to discuss the Q4 roadmap.";

  const extract = await client.chat.completions.create({
    messages: [
      {
        role: "system",
        content: `The following is a voice message transcript. Only answer in JSON and follow this schema ${JSON.stringify(jsonSchema)}.`,
      },
      { role: "user", content: transcript },
    ],
    model: "openai/gpt-oss-20b",
    response_format: {
      type: "json_schema",
      json_schema: {
        name: "voice_note",
        schema: jsonSchema,
      },
    },
  });

  if (extract?.choices?.[0]?.message?.content) {
    const output = JSON.parse(extract.choices[0].message.content);
    console.log(JSON.stringify(output, null, 2));
  }
  console.log();
}

// --- 2. json_object (simple) ---
/**
 * Ask for valid JSON whose structure is guided by the prompt only.
 */
async function jsonObjectExample(): Promise<void> {
  console.log("=== json_object (simple) ===");

  const response = await client.chat.completions.create({
    model: "openai/gpt-oss-20b",
    messages: [
      { role: "system", content: "Respond in JSON with keys: name, age, city, hobby" },
      { role: "user", content: "Make up a character for a story" },
    ],
    response_format: { type: "json_object" },
  });

  if (response?.choices?.[0]?.message?.content) {
    const output = JSON.parse(response.choices[0].message.content);
    console.log(JSON.stringify(output, null, 2));
  }
  console.log();
}

// --- 3. regex (classification) ---
/**
 * Constrain the output to one of three sentiment labels with a regex
 * response format. The result is plain text, so it is printed without
 * JSON parsing.
 */
async function regexExample(): Promise<void> {
  console.log("=== regex (classification) ===");

  // The current TS SDK types do not yet expose regex response_format.
  const response = await client.chat.completions.create(
    {
      model: "meta-llama/Llama-3.3-70B-Instruct-Turbo",
      temperature: 0.2,
      max_tokens: 10,
      messages: [
        {
          role: "system",
          content: "Classify the sentiment of the text as positive, neutral, or negative.",
        },
        {
          role: "user",
          content: "The food was absolutely amazing, best meal I've ever had!",
        },
      ],
      response_format: {
        type: "regex",
        pattern: "(positive|neutral|negative)",
      },
    } as any
  );

  console.log(`Sentiment: ${response?.choices[0]?.message?.content}`);
  console.log();
}

// --- 4. json_schema with reasoning model ---
/**
 * Extract a step-by-step math solution as structured JSON from a
 * reasoning model, using a nested Zod schema.
 */
async function reasoningJsonExample(): Promise<void> {
  console.log("=== json_schema + reasoning model ===");

  const stepSchema = z.object({
    explanation: z.string(),
    output: z.string(),
  });
  const mathReasoningSchema = z.object({
    steps: z.array(stepSchema),
    final_answer: z.string(),
  });
  const jsonSchema = z.toJSONSchema(mathReasoningSchema);

  const completion = await client.chat.completions.create({
    model: "deepseek-ai/DeepSeek-V4-Pro",
    messages: [
      {
        role: "system",
        content: "You are a helpful math tutor. Guide the user through the solution step by step.",
      },
      { role: "user", content: "how can I solve 8x + 7 = -23" },
    ],
    response_format: {
      type: "json_schema",
      json_schema: {
        name: "math_reasoning",
        schema: jsonSchema,
      },
    },
  });

  if (completion?.choices?.[0]?.message?.content) {
    const result = JSON.parse(completion.choices[0].message.content);
    console.log(JSON.stringify(result, null, 2));
  }
}
Guide the user through the solution step by step.", + }, + { role: "user", content: "how can I solve 8x + 7 = -23" }, + ], + response_format: { + type: "json_schema", + json_schema: { + name: "math_reasoning", + schema: jsonSchema, + }, + }, + }); + + if (completion?.choices?.[0]?.message?.content) { + const result = JSON.parse(completion.choices[0].message.content); + console.log(JSON.stringify(result, null, 2)); + } +} + +async function main(): Promise { + await jsonSchemaExample(); + await jsonObjectExample(); + await regexExample(); + await reasoningJsonExample(); +} + +main(); diff --git a/plugins/togetherai/skills/together-chat-completions/scripts/tool_call_loop.py b/plugins/togetherai/skills/together-chat-completions/scripts/tool_call_loop.py new file mode 100644 index 00000000..f70e4758 --- /dev/null +++ b/plugins/togetherai/skills/together-chat-completions/scripts/tool_call_loop.py @@ -0,0 +1,119 @@ +#!/usr/bin/env python3 +""" +Together AI Function Calling - Complete Tool Call Loop (v2 SDK) + +Defines tools, sends a request, executes function calls, and passes +results back to the model for a final response. Handles parallel calls. + +Usage: + python tool_call_loop.py + +Requires: + uv pip install "together>=2.0.0" + export TOGETHER_API_KEY=your_key +""" + +import json +from together import Together + +client = Together() + +# --- 1. Define tools --- +tools = [ + { + "type": "function", + "function": { + "name": "get_weather", + "description": "Get the current weather in a city", + "parameters": { + "type": "object", + "properties": { + "location": {"type": "string", "description": "City name, e.g. 
# --- 1. Define tools ---
# JSON-schema descriptions of the callable tools, sent with each request.
tools = [
    {
        "type": "function",
        "function": {
            "name": "get_weather",
            "description": "Get the current weather in a city",
            "parameters": {
                "type": "object",
                "properties": {
                    "location": {"type": "string", "description": "City name, e.g. 'San Francisco, CA'"},
                    "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
                },
                "required": ["location"],
            },
        },
    },
    {
        "type": "function",
        "function": {
            "name": "get_stock_price",
            "description": "Get the current stock price for a ticker symbol",
            "parameters": {
                "type": "object",
                "properties": {
                    "symbol": {"type": "string", "description": "Stock ticker, e.g. 'AAPL'"},
                },
                "required": ["symbol"],
            },
        },
    },
]


# --- 2. Implement your functions ---
def get_weather(location: str, unit: str = "fahrenheit") -> dict:
    """Return a canned weather report. Replace with real API call."""
    return {"location": location, "temperature": 72, "unit": unit, "condition": "sunny"}


def get_stock_price(symbol: str) -> dict:
    """Return a canned stock quote. Replace with real API call."""
    return {"symbol": symbol, "price": 185.50, "currency": "USD"}


# Maps the tool names declared in ``tools`` to their local implementations.
FUNCTIONS = {
    "get_weather": get_weather,
    "get_stock_price": get_stock_price,
}


def call_tool(name: str, arguments: str) -> dict:
    """Run the local implementation of a tool requested by the model.

    Args:
        name: Tool name taken from ``tool_call.function.name``.
        arguments: JSON-encoded keyword arguments taken from
            ``tool_call.function.arguments``.

    Returns:
        The dict produced by the matching function in ``FUNCTIONS``.

    Raises:
        KeyError: If ``name`` has no entry in ``FUNCTIONS``; the message
            names the missing tool.
        json.JSONDecodeError: If ``arguments`` is not valid JSON.
    """
    fn_args = json.loads(arguments)
    fn = FUNCTIONS.get(name)
    if fn is None:
        # Same exception type as a plain dict lookup, but with a message
        # that says which tool is missing.
        raise KeyError(f"No implementation found for tool: {name}")

    print(f"Calling {name}({fn_args})")
    return fn(**fn_args)


def main() -> None:
    """Send one request with tools, run requested tool calls, print the reply."""
    # --- 3. Send request with tools ---
    messages = [
        {"role": "system", "content": "You are a helpful assistant with access to weather and stock tools."},
        {"role": "user", "content": "What's the weather in NYC and the current Apple stock price?"},
    ]

    response = client.chat.completions.create(
        model="meta-llama/Llama-3.3-70B-Instruct-Turbo",
        messages=messages,
        tools=tools,
    )

    # --- 4. Process tool calls (handles parallel calls) ---
    assistant_message = response.choices[0].message
    tool_calls = assistant_message.tool_calls

    if not tool_calls:
        # Model responded directly without calling tools
        print(f"Assistant: {assistant_message.content}")
        return

    # Add assistant message with tool calls to history
    messages.append(assistant_message)

    for tc in tool_calls:
        result = call_tool(tc.function.name, tc.function.arguments)

        # Add each tool result to history, keyed by the originating call id
        messages.append({
            "role": "tool",
            "tool_call_id": tc.id,
            "content": json.dumps(result),
        })

    # --- 5. Get final response with tool results ---
    final = client.chat.completions.create(
        model="meta-llama/Llama-3.3-70B-Instruct-Turbo",
        messages=messages,
        tools=tools,
    )
    print(f"\nAssistant: {final.choices[0].message.content}")
Get final response with tool results --- + final = client.chat.completions.create( + model="meta-llama/Llama-3.3-70B-Instruct-Turbo", + messages=messages, + tools=tools, + ) + print(f"\nAssistant: {final.choices[0].message.content}") + else: + # Model responded directly without calling tools + print(f"Assistant: {response.choices[0].message.content}") + + +if __name__ == "__main__": + main() diff --git a/plugins/togetherai/skills/together-chat-completions/scripts/tool_call_loop.ts b/plugins/togetherai/skills/together-chat-completions/scripts/tool_call_loop.ts new file mode 100644 index 00000000..7653a5fd --- /dev/null +++ b/plugins/togetherai/skills/together-chat-completions/scripts/tool_call_loop.ts @@ -0,0 +1,154 @@ +#!/usr/bin/env -S npx tsx +/** + * Together AI Function Calling - Complete Tool Call Loop + * + * Defines tools, sends a request, executes function calls, and passes + * results back to the model for a final response. Handles parallel calls. + * + * Usage: + * npx tsx tool_call_loop.ts + * + * Requires: + * npm install together-ai + * export TOGETHER_API_KEY=your_key + */ + +import Together from "together-ai"; +import type { + ChatCompletionMessageParam, + ChatCompletionTool, +} from "together-ai/resources/chat/completions"; + +const client = new Together({ + apiKey: process.env.TOGETHER_API_KEY, +}); + +// --- 1. Define tools --- +const tools: ChatCompletionTool[] = [ + { + type: "function", + function: { + name: "getWeather", + description: "Get the current weather in a city", + parameters: { + type: "object", + properties: { + location: { + type: "string", + description: "City name, e.g. 
// --- 1. Define tools ---
// JSON-schema descriptions of the callable tools, sent with each request.
const tools: ChatCompletionTool[] = [
  {
    type: "function",
    function: {
      name: "getWeather",
      description: "Get the current weather in a city",
      parameters: {
        type: "object",
        properties: {
          location: {
            type: "string",
            description: "City name, e.g. 'San Francisco, CA'",
          },
          unit: { type: "string", enum: ["celsius", "fahrenheit"] },
        },
        required: ["location"],
      },
    },
  },
  {
    type: "function",
    function: {
      name: "getStockPrice",
      description: "Get the current stock price for a ticker symbol",
      parameters: {
        type: "object",
        properties: {
          symbol: {
            type: "string",
            description: "Stock ticker, e.g. 'AAPL'",
          },
        },
        required: ["symbol"],
      },
    },
  },
];

// --- 2. Implement your functions ---
/** Return a canned weather report; `unit` defaults to "fahrenheit". */
function getWeather(args: {
  location: string;
  unit?: string;
}): Record<string, unknown> {
  // Replace with real API call
  return {
    location: args.location,
    temperature: 72,
    unit: args.unit ?? "fahrenheit",
    condition: "sunny",
  };
}

/** Return a canned stock quote for the given ticker symbol. */
function getStockPrice(args: { symbol: string }): Record<string, unknown> {
  // Replace with real API call
  return { symbol: args.symbol, price: 185.5, currency: "USD" };
}

// Maps the tool names declared in `tools` to their local implementations.
const functions: Record<
  string,
  (args: any) => Record<string, unknown>
> = {
  getWeather,
  getStockPrice,
};

// --- 3. Send request with tools ---
/**
 * Send one request with tools, run every tool call the model requests,
 * then ask the model for a final answer that uses the tool results.
 *
 * Throws when the response has no assistant message or when the model
 * names a tool that has no entry in `functions`.
 */
async function main(): Promise<void> {
  const messages: ChatCompletionMessageParam[] = [
    {
      role: "system",
      content: "You are a helpful assistant with access to weather and stock tools.",
    },
    {
      role: "user",
      content: "What's the weather in NYC and the current Apple stock price?",
    },
  ];

  const response = await client.chat.completions.create({
    model: "meta-llama/Llama-3.3-70B-Instruct-Turbo",
    messages,
    tools,
  });

  // --- 4. Process tool calls (handles parallel calls) ---
  const assistantMessage = response.choices[0]?.message;
  if (!assistantMessage) {
    throw new Error("Model returned no assistant message.");
  }
  const toolCalls = assistantMessage.tool_calls ?? [];

  if (toolCalls.length > 0) {
    // Add assistant message with tool calls to history
    messages.push(assistantMessage);

    for (const tc of toolCalls) {
      const fnName = tc.function.name;
      const fnArgs = JSON.parse(tc.function.arguments);
      const fn = functions[fnName];
      if (!fn) {
        throw new Error(`No implementation found for tool: ${fnName}`);
      }

      console.log(`Calling ${fnName}(${JSON.stringify(fnArgs)})`);
      const result = fn(fnArgs);

      // Add each tool result to history, keyed by the originating call id
      messages.push({
        role: "tool",
        tool_call_id: tc.id,
        content: JSON.stringify(result),
      });
    }

    // --- 5. Get final response with tool results ---
    const final = await client.chat.completions.create({
      model: "meta-llama/Llama-3.3-70B-Instruct-Turbo",
      messages,
      tools,
    });
    console.log(`\nAssistant: ${final.choices[0]?.message?.content}`);
  } else {
    // Model responded directly without calling tools
    console.log(`Assistant: ${assistantMessage.content}`);
  }
}
[]; + + if (toolCalls.length > 0) { + // Add assistant message with tool calls to history + messages.push(assistantMessage); + + for (const tc of toolCalls) { + const fnName = tc.function.name; + const fnArgs = JSON.parse(tc.function.arguments); + const fn = functions[fnName]; + if (!fn) { + throw new Error(`No implementation found for tool: ${fnName}`); + } + + console.log(`Calling ${fnName}(${JSON.stringify(fnArgs)})`); + const result = fn(fnArgs); + + // Add each tool result to history + messages.push({ + role: "tool", + tool_call_id: tc.id, + content: JSON.stringify(result), + }); + } + + // --- 5. Get final response with tool results --- + const final = await client.chat.completions.create({ + model: "meta-llama/Llama-3.3-70B-Instruct-Turbo", + messages, + tools, + }); + console.log(`\nAssistant: ${final.choices[0].message.content}`); + } else { + // Model responded directly without calling tools + console.log(`Assistant: ${response.choices[0].message.content}`); + } +} + +main(); diff --git a/plugins/togetherai/skills/together-dedicated-containers/SKILL.md b/plugins/togetherai/skills/together-dedicated-containers/SKILL.md new file mode 100644 index 00000000..4c7e1682 --- /dev/null +++ b/plugins/togetherai/skills/together-dedicated-containers/SKILL.md @@ -0,0 +1,70 @@ +--- +name: together-dedicated-containers +description: "Custom Dockerized inference workers on Together AI's managed GPU infrastructure. Build with Sprocket SDK, configure with Jig CLI, submit async queue jobs, and poll results. Reach for it whenever the user needs container-level control rather than a standard model endpoint or raw cluster." +--- + +# Together Dedicated Containers + +## Overview + +Use Dedicated Container Inference when the user needs a custom runtime, not just managed model +hosting. 
+ +Core building blocks: + +- Jig CLI for build and deployment +- Sprocket SDK for request handling inside the container +- Queue API for async jobs + +## When This Skill Wins + +- Deploy a custom inference worker +- Bundle custom dependencies or runtime logic into a container +- Use queue-based async processing with progress tracking +- Run a specialized image, video, or multimodal pipeline + +## Hand Off To Another Skill + +- Use `together-dedicated-endpoints` for standard model hosting without custom containers +- Use `together-gpu-clusters` for full cluster ownership and orchestration control +- Use `together-chat-completions`, `together-images`, or `together-video` when a serverless product already covers the task + +## Quick Routing + +- Minimal worker template + - Start with [scripts/sprocket_hello_world.py](scripts/sprocket_hello_world.py) + - Read [references/sprocket-sdk.md](references/sprocket-sdk.md) +- Build, deploy, logs, queue, and secrets + - Read [references/jig-cli.md](references/jig-cli.md) +- Queue submission and polling + - Start with [scripts/queue_client.py](scripts/queue_client.py) or [scripts/queue_client.ts](scripts/queue_client.ts) + +## Workflow + +1. Confirm that the user truly needs a custom container runtime. +2. Implement the worker with Sprocket's request lifecycle. +3. Configure `pyproject.toml` for image, runtime, replica scaling (`min_replicas` / `max_replicas`), and volume mounts. Do not add `[tool.jig.autoscaling]`; the API currently rejects it (see [references/jig-cli.md](references/jig-cli.md)). +4. Deploy with Jig. +5. Submit jobs through the queue API and poll until completion. + +## High-Signal Rules + +- Python scripts require the Together v2 SDK (`together>=2.0.0`). If the user is on an older version, they must upgrade first: `uv pip install --upgrade "together>=2.0.0"`. +- Prefer dedicated endpoints over containers unless the runtime or pipeline is genuinely custom. +- Treat the worker contract and `pyproject.toml` as the source of truth for deployment behavior. +- Parameterize deployment name, queue inputs, and resource sizing instead of hardcoding them. 
+- Queue-based jobs are asynchronous by default; account for polling and result retrieval in client code. + +## Resource Map + +- Jig CLI: [references/jig-cli.md](references/jig-cli.md) +- Sprocket SDK: [references/sprocket-sdk.md](references/sprocket-sdk.md) +- Python queue client: [scripts/queue_client.py](scripts/queue_client.py) +- TypeScript queue client: [scripts/queue_client.ts](scripts/queue_client.ts) +- Worker template: [scripts/sprocket_hello_world.py](scripts/sprocket_hello_world.py) + +## Official Docs + +- [Dedicated Container Inference](https://docs.together.ai/docs/dedicated-container-inference) +- [Containers Quickstart](https://docs.together.ai/docs/containers-quickstart) +- [Deployments API](https://docs.together.ai/reference/deployments-create) diff --git a/plugins/togetherai/skills/together-dedicated-containers/references/jig-cli.md b/plugins/togetherai/skills/together-dedicated-containers/references/jig-cli.md new file mode 100644 index 00000000..588fa36f --- /dev/null +++ b/plugins/togetherai/skills/together-dedicated-containers/references/jig-cli.md @@ -0,0 +1,526 @@ +# Jig CLI Reference +## Contents + +- [Installation](#installation) +- [Environment Variables](#environment-variables) +- [Build Commands](#build-commands) +- [Deployment Commands](#deployment-commands) +- [Queue Commands](#queue-commands) +- [Queue API](#queue-api) +- [Secrets Commands](#secrets-commands) +- [Volumes Commands](#volumes-commands) +- [Configuration (pyproject.toml)](#configuration-pyprojecttoml) +- [Full Example](#full-example) +- [Container Registry](#container-registry) +- [Debug Mode](#debug-mode) + + +## Installation + +```shell +uv pip install "together>=2.0.0" +# or +uv tool install together +``` + +Jig commands are under `together beta jig`. 
+ +## Environment Variables + +| Variable | Default | Description | +|----------|---------|-------------| +| `TOGETHER_API_KEY` | Required | Your Together API key | +| `TOGETHER_DEBUG` | `""` | Enable debug logging (`"1"` or `"true"`) | +| `WARMUP_ENV_NAME` | `TORCHINDUCTOR_CACHE_DIR` | Environment variable for cache location | +| `WARMUP_DEST` | `torch_cache` | Cache directory path in container | + +All commands are subcommands of `together beta jig`. Use `--config <path>` to specify a custom config file (default: `pyproject.toml`). + +## Build Commands + +### jig init + +Create a starter `pyproject.toml` with sensible defaults. + +```shell +together beta jig init +``` + +### jig dockerfile + +Generate a Dockerfile from your `pyproject.toml` configuration. Useful for debugging the build. + +```shell +together beta jig dockerfile +``` + +### jig build + +Build the Docker image locally. + +```shell +together beta jig build [flags] +``` + +| Flag | Description | +|------|-------------| +| `--tag <tag>` | Image tag (default: content-hash) | +| `--warmup` | Pre-generate compile caches after build (requires GPU) | + +### jig push + +Push the built image to Together's registry at `registry.together.xyz`. + +```shell +together beta jig push [flags] +``` + +| Flag | Description | +|------|-------------| +| `--tag <tag>` | Image tag to push | + +## Deployment Commands + +### jig deploy + +Build, push, and create or update the deployment. Combines `build`, `push`, and deployment creation into one step. + +```shell +together beta jig deploy [flags] +``` + +| Flag | Description | +|------|-------------| +| `--tag <tag>` | Image tag | +| `--warmup` | Pre-generate compile caches (requires GPU) | +| `--build-only` | Build and push only, skip deployment creation | +| `--image <image>` | Deploy an existing image, skip build and push | + +### jig status + +Show deployment status and configuration. + +```shell +together beta jig status +``` + +### jig list + +List all deployments in your organization. 
+ +```shell +together beta jig list +``` + +### jig logs + +Retrieve deployment logs. + +```shell +together beta jig logs [flags] +``` + +| Flag | Description | +|------|-------------| +| `--follow` | Stream logs in real-time | + +### jig endpoint + +Print the deployment's endpoint URL. + +```shell +together beta jig endpoint +``` + +### jig destroy + +Delete the deployment. + +```shell +together beta jig destroy +``` + +## Queue Commands + +### jig submit + +Submit a job to the deployment's queue. + +```shell +together beta jig submit [flags] +``` + +| Flag | Description | +|------|-------------| +| `--prompt <text>` | Shorthand for `--payload '{"prompt": "..."}'` | +| `--payload <json>` | Full JSON payload | +| `--watch` | Wait for the job to complete and print the result | + +Example: + +```shell +together beta jig submit --payload '{"prompt": "A cat playing piano"}' --watch +``` + +### jig job_status + +Get the status of a submitted job. + +```shell +together beta jig job_status --request-id <request_id> +``` + +| Flag | Description | +|------|-------------| +| `--request-id <request_id>` | The job's request ID (required) | + +### jig queue_status + +Show queue backlog and worker status. 
+ +```shell +together beta jig queue_status +``` + +## Queue API + +### Python (v2 SDK) + +```python +from together import Together +client = Together() + +# Submit +job = client.beta.jig.queue.submit(model="my-deployment", payload={"prompt": "Hello"}, priority=1) + +# Poll status +status = client.beta.jig.queue.retrieve(request_id=job.request_id, model="my-deployment") +``` + +### TypeScript + +```typescript +import Together from "together-ai"; +const client = new Together(); + +// Submit +const job = await client.beta.jig.queue.submit({ model: "my-deployment", payload: { prompt: "Hello" }, priority: 1 }); + +// Poll status +const status = await client.beta.jig.queue.retrieve({ request_id: job.requestId!, model: "my-deployment" }); +``` + +### cURL + +Submit a job: + +```shell +curl -X POST "https://api.together.ai/v1/queue/submit" \ + -H "Authorization: Bearer $TOGETHER_API_KEY" \ + -H "Content-Type: application/json" \ + -d '{"model": "my-deployment", "payload": {"prompt": "Hello world"}, "priority": 1}' +``` + +Poll job status: + +```shell +curl "https://api.together.ai/v1/queue/status?model=my-deployment&request_id=req_abc123" \ + -H "Authorization: Bearer $TOGETHER_API_KEY" +``` + +Cancel a job: + +```shell +curl -X POST "https://api.together.ai/v1/queue/cancel" \ + -H "Authorization: Bearer $TOGETHER_API_KEY" \ + -H "Content-Type: application/json" \ + -d '{"model": "my-deployment", "request_id": "req_abc123"}' +``` + +Queue metrics: + +```shell +curl "https://api.together.ai/v1/queue/metrics?model=my-deployment" \ + -H "Authorization: Bearer $TOGETHER_API_KEY" +``` + +Health check: + +```shell +curl https://api.together.ai/v1/deployment-request/my-deployment/health \ + -H "Authorization: Bearer $TOGETHER_API_KEY" +``` + +### Queue Submit Request + +| Field | Type | Required | Description | +|-------|------|----------|-------------| +| `model` | string | Yes | Deployment name | +| `payload` | object | Yes | Freeform model input (passed to predict()) | +| 
`priority` | integer | No | Higher values process first (default: 0) | +| `info` | object | No | Arbitrary metadata stored with the job | + +### Queue Status Response + +| Field | Type | Description | +|-------|------|-------------| +| `request_id` | string | Job identifier | +| `model` | string | Deployment name | +| `status` | string | `pending`, `running`, `done`, `failed`, `canceled` | +| `outputs` | object | Model output (when done) | +| `info` | object | Job metadata (including emit_info updates) | +| `priority` | integer | Job priority | +| `retries` | integer | Retry count (fails after 3) | +| `created_at` | datetime | Submission time | +| `claimed_at` | datetime | Worker claim time | +| `done_at` | datetime | Completion time | + +## Secrets Commands + +Secrets are encrypted environment variables injected at runtime. + +### jig secrets set + +```shell +together beta jig secrets set --name <name> --value <value> [flags] +``` + +| Flag | Description | +|------|-------------| +| `--name <name>` | Secret name (required) | +| `--value <value>` | Secret value (required) | +| `--description <text>` | Human-readable description | + +Example: + +```shell +together beta jig secrets set --name HF_TOKEN --value hf_xxxxx --description "Hugging Face token" +``` + +### jig secrets list + +List all secrets for the deployment. + +```shell +together beta jig secrets list +``` + +### jig secrets unset + +Remove a secret. + +```shell +together beta jig secrets unset <name> +``` + +## Volumes Commands + +Volumes mount read-only data (such as model weights) into your container without baking them into the image. + +### jig volumes create + +Create a volume and upload files. 
+ +```shell +together beta jig volumes create --name --source +``` + +| Flag | Description | +|------|-------------| +| `--name ` | Volume name (required) | +| `--source ` | Local directory to upload (required) | + +Example: + +```shell +together beta jig volumes create --name my-weights --source ./model_weights/ +``` + +### jig volumes update + +Update a volume with new files. + +```shell +together beta jig volumes update --name --source +``` + +### jig volumes describe + +Show volume details and contents. + +```shell +together beta jig volumes describe --name +``` + +### jig volumes list + +List all volumes. + +```shell +together beta jig volumes list +``` + +### jig volumes delete + +Delete a volume. + +```shell +together beta jig volumes delete --name +``` + +Mount a volume by adding to your `pyproject.toml`: + +```toml +[[tool.jig.volume_mounts]] +name = "my-weights" +mount_path = "/models" +``` + +## Configuration (pyproject.toml) + +Jig reads configuration from your `pyproject.toml` file or a standalone `jig.toml` file. You can also specify a custom config file explicitly: + +```shell +together beta jig --config staging_jig.toml deploy +``` + +This is useful for managing multiple environments (e.g., `staging_jig.toml`, `production_jig.toml`). + +### `[tool.jig.image]` -- Build Settings + +| Key | Type | Default | Description | +|-----|------|---------|-------------| +| `python_version` | string | `"3.11"` | Python version for the container base image | +| `system_packages` | string[] | `[]` | APT packages to install (e.g., `ffmpeg`, `git`, `libgl1`) | +| `environment` | object | `{}` | Build-time + runtime env vars (set as `ENV` directives) | +| `run` | string[] | `[]` | Extra shell commands during build (each becomes a `RUN` instruction). See [CUDA PyTorch note](#cuda-pytorch) | +| `cmd` | string | `"python app.py"` | Container startup command (Docker `CMD`). 
Include `--queue` for Sprocket | +| `copy` | string[] | `[]` | Files and directories to include in container | +| `auto_include_git` | bool | `false` | Auto-include git-tracked files (requires clean repo) | + +### CUDA PyTorch + +The Jig base image (`python:3.11-slim`) does not include CUDA. A plain `torch>=2.0` dependency +installs CPU-only PyTorch, so `torch.cuda.is_available()` will be `False` even on GPU nodes. + +For GPU workloads, install the CUDA-enabled PyTorch wheel via `run`: + +```toml +[tool.jig.image] +run = ["pip install torch --index-url https://download.pytorch.org/whl/cu121"] +``` + +Do not list `torch` in `[project] dependencies` when using this approach -- the `run` +install handles it. Other packages that depend on torch (e.g. `openai-whisper`) will use the +already-installed CUDA build. + +Workers should also auto-detect the device to ease local testing: + +```python +device = "cuda" if torch.cuda.is_available() else "cpu" +model = load_model(device=device) +``` + +### `[tool.jig.deploy]` -- Runtime Settings + +| Key | Type | Default | Description | +|-----|------|---------|-------------| +| `description` | string | `""` | Human-readable description | +| `gpu_type` | string | `"h100-80gb"` | `"h100-80gb"` or `"none"` (CPU-only) | +| `gpu_count` | int | `1` | GPUs per replica | +| `cpu` | float | `1.0` | CPU cores per replica (supports fractional, e.g. `0.1`) | +| `memory` | float | `8.0` | Memory in GB (supports fractional, e.g. 
`0.5`) | +| `storage` | int | `100` | Ephemeral disk in GB | +| `min_replicas` | int | `1` | Min replicas (0 for scale-to-zero) | +| `max_replicas` | int | `1` | Max replicas | +| `port` | int | `8000` | Container listen port | +| `health_check_path` | string | `"/health"` | Health endpoint (must return 200 when ready) | +| `termination_grace_period_seconds` | int | `300` | Shutdown timeout for in-flight jobs | +| `command` | string[] | `null` | Override startup command at deploy time (e.g., `["python", "app.py", "--queue"]`) | + +### `[tool.jig.deploy.environment_variables]` + +Runtime environment variables injected into your container. For sensitive values, use secrets instead. + +```toml +[tool.jig.deploy.environment_variables] +MODEL_PATH = "/models/weights" +TORCH_COMPILE = "1" +LOG_LEVEL = "INFO" +``` + +### `[tool.jig.autoscaling]` + +> Not yet supported. The autoscaling config below is planned but the API currently rejects +> it with `unknown autoscaling metric`. Do not include `[tool.jig.autoscaling]` in your +> `pyproject.toml` until this feature is live. Use `min_replicas` / `max_replicas` under +> `[tool.jig.deploy]` for basic scaling control in the meantime. 
+ +```toml +# NOT YET SUPPORTED -- will cause deployment failure +[tool.jig.autoscaling] +profile = "QueueBacklogPerWorker" +targetValue = "1.05" +``` + +### `[[tool.jig.volume_mounts]]` + +```toml +[[tool.jig.volume_mounts]] +name = "my-weights" +mount_path = "/models" +``` + +## Full Example + +```toml +[project] +name = "video-generator" +version = "0.1.0" +requires-python = ">=3.11" +dependencies = ["diffusers", "sprocket"] + +[tool.jig.image] +python_version = "3.11" +system_packages = ["git", "ffmpeg", "libgl1"] +environment = { TORCH_CUDA_ARCH_LIST = "8.0 9.0" } +run = [ + "pip install torch --index-url https://download.pytorch.org/whl/cu121", + "pip install flash-attn --no-build-isolation", +] +cmd = "python app.py --queue" +copy = ["app.py", "models/"] + +[tool.jig.deploy] +description = "Video generation model" +gpu_type = "h100-80gb" +gpu_count = 2 +cpu = 8 +memory = 64 +min_replicas = 1 +max_replicas = 20 +port = 8000 +health_check_path = "/health" + +[tool.jig.deploy.environment_variables] +MODEL_PATH = "/models/weights" +TORCH_COMPILE = "1" + +[[tool.jig.volume_mounts]] +name = "my-weights" +mount_path = "/models" +``` + +## Container Registry + +- Host: `registry.together.xyz` +- Private to your organization +- Images referenced by digest for reproducibility +- Authentication handled automatically by Jig CLI + +## Debug Mode + +```shell +export TOGETHER_DEBUG=1 +together beta jig deploy +``` diff --git a/plugins/togetherai/skills/together-dedicated-containers/references/sprocket-sdk.md b/plugins/togetherai/skills/together-dedicated-containers/references/sprocket-sdk.md new file mode 100644 index 00000000..7d0ca3b8 --- /dev/null +++ b/plugins/togetherai/skills/together-dedicated-containers/references/sprocket-sdk.md @@ -0,0 +1,189 @@ +# Sprocket SDK Reference +## Contents + +- [Overview](#overview) +- [Installation](#installation) +- [Core Pattern](#core-pattern) +- [`sprocket.Sprocket` Base Class](#sprocketsprocket-base-class) +- 
[`sprocket.run(sprocket, name=None, use_torchrun=False)`](#sprocketrun) +- [`sprocket.FileOutput`](#sprocketfileoutput) +- [`sprocket.emit_info(info: dict)`](#sprocketemitinfo) +- [`sprocket.InputOutputProcessor`](#sprocketinputoutputprocessor) +- [HTTP Endpoints (Sprocket exposes)](#http-endpoints) +- [CLI Arguments](#cli-arguments) +- [Environment Variables](#environment-variables) +- [Multi-GPU Pattern](#multi-gpu-pattern) +- [Graceful Shutdown](#graceful-shutdown) + + +## Overview +Sprocket is the worker framework for Together Dedicated Containers. It handles job receiving, processing, and result reporting. + +## Installation + +```shell +pip install sprocket --extra-index-url https://pypi.together.ai/ +# or with Together's private PyPI in pyproject.toml: +[[tool.uv.index]] +name = "together-pypi" +url = "https://pypi.together.ai/" +``` + +## Core Pattern + +```python +import sprocket + +class MyModel(sprocket.Sprocket): + def setup(self) -> None: + """Called once at startup. Load models here.""" + self.model = load_model() + + def predict(self, args: dict) -> dict: + """Called for each job. Process and return results.""" + result = self.model(args["input"]) + return {"output": result} + + def shutdown(self) -> None: + """Optional. Called on graceful shutdown.""" + pass + +if __name__ == "__main__": + sprocket.run(MyModel()) +``` + +## `sprocket.Sprocket` Base Class + +### Methods + +| Method | Signature | Description | +|--------|-----------|-------------| +| `setup` | `setup(self) -> None` | Called once at startup. Load models and resources. | +| `predict` | `predict(self, args: dict) -> dict` | Called per job. Return results dict. | +| `shutdown` | `shutdown(self) -> None` | Optional. Cleanup on shutdown. 
| + +### Class Attributes + +| Attribute | Type | Default | Description | +|-----------|------|---------|-------------| +| `processor` | `Type[InputOutputProcessor]` | `InputOutputProcessor` | Custom I/O processor | +| `warmup_inputs` | `list[dict]` | `[]` | Inputs for cache warmup | + +## `sprocket.run(sprocket, name=None, use_torchrun=False)` + +| Parameter | Type | Description | +|-----------|------|-------------| +| `sprocket` | Sprocket | Your Sprocket instance | +| `name` | str \| None | Optional deployment/queue name. If omitted, Sprocket reads it from the `TOGETHER_DEPLOYMENT_NAME` environment variable that the platform injects at runtime. Pass explicitly only when you need to override the platform-provided name. | +| `use_torchrun` | bool | Enable multi-GPU mode. Default: False | + +## `sprocket.FileOutput` + +Wrap local files for automatic upload: + +```python +def predict(self, args): + video.save("output.mp4") + return {"video": sprocket.FileOutput("output.mp4"), "duration": 10.5} +``` + +The file is uploaded after `predict()` returns, and the path is replaced with an access URL in the final result. The URL is not publicly readable -- clients must authenticate with their Together API key (e.g. `Authorization: Bearer $TOGETHER_API_KEY`) when fetching it. + +## `sprocket.emit_info(info: dict)` + +> Availability note: `emit_info` is not available in all Sprocket versions. Guard calls +> with `hasattr(sprocket, "emit_info")` or wrap in a try/except. If unavailable, omit +> progress reporting -- jobs will still complete normally. 
+ +Report progress from inside `predict()`: + +```python +def predict(self, args): + for i in range(100): + frame = generate_frame(i) + if hasattr(sprocket, "emit_info"): + sprocket.emit_info({"progress": (i + 1) / 100, "status": "generating"}) + return {"video": sprocket.FileOutput("output.mp4")} +``` + +Constraints: +- Must serialize to under 4,096 bytes JSON +- Updates batched and merged (later values overwrite earlier) +- With `use_torchrun=True`, call only from rank 0 + +## `sprocket.InputOutputProcessor` + +Custom I/O processing: + +```python +class CustomProcessor(sprocket.InputOutputProcessor): + def process_input_file(self, resp, dst): + """Custom download logic (e.g., decompression).""" + pass + + async def finalize(self, request_id, inputs, outputs): + """Post-processing after predict(), before FileOutput upload.""" + return outputs +``` + +## HTTP Endpoints (Sprocket exposes) + +| Endpoint | Method | Response | +|----------|--------|---------| +| `/health` | GET | `{"status": "healthy"}` (200) or `{"status": "unhealthy"}` (503) | +| `/metrics` | GET | Prometheus format: `requests_inflight 0.0` | +| `/generate` | POST | Direct HTTP inference (non-queue mode) | + +## CLI Arguments + +| Argument | Default | Description | +|----------|---------|-------------| +| `--queue` | false | Enable queue worker mode | +| `--port` | 8000 | HTTP server port | + +## Environment Variables + +| Variable | Default | Description | +|----------|---------|-------------| +| `TOGETHER_API_KEY` | Required | API key | +| `TOGETHER_API_BASE_URL` | `https://api.together.ai` | API base URL | +| `TERMINATION_GRACE_PERIOD_SECONDS` | 300 | Shutdown + prediction timeout | +| `WORLD_SIZE` | 1 | GPU processes (set by torchrun) | + +## Multi-GPU Pattern + +```python +import torch +import torch.distributed as dist + +class MultiGPUModel(sprocket.Sprocket): + def setup(self): + dist.init_process_group() + torch.cuda.set_device(dist.get_rank()) + self.model = load_model().to("cuda") + + def predict(self, 
args): + output = self.model(args["input"]) + if dist.get_rank() == 0: + save_output("result.mp4") + return {"video": sprocket.FileOutput("result.mp4")} + # Other ranks return None + +sprocket.run(MultiGPUModel(), use_torchrun=True) +``` + +Config for multi-GPU: +```toml +[tool.jig.deploy] +gpu_type = "h100-80gb" +gpu_count = 2 +``` + +When `use_torchrun=True` is passed to `sprocket.run()`, Sprocket launches torchrun internally. No need to override `cmd`. + +## Graceful Shutdown + +1. Container receives SIGTERM +2. Sprocket stops accepting new jobs +3. Current job runs to completion +4. `shutdown()` called +5. Container exits +6. Total time: `termination_grace_period_seconds` (default 300s) diff --git a/plugins/togetherai/skills/together-dedicated-containers/scripts/queue_client.py b/plugins/togetherai/skills/together-dedicated-containers/scripts/queue_client.py new file mode 100644 index 00000000..736859a7 --- /dev/null +++ b/plugins/togetherai/skills/together-dedicated-containers/scripts/queue_client.py @@ -0,0 +1,109 @@ +#!/usr/bin/env python3 +""" +Together AI Dedicated Containers - Queue Client (v2 SDK) + +Submit jobs, poll for results, and manage queue operations. 
+ +Usage: + python queue_client.py + +Requires: + uv pip install "together>=2.0.0" + export TOGETHER_API_KEY=your_key + export TOGETHER_DEPLOYMENT_NAME=your-deployment-name +""" + +import os +import time +from together import Together + +client = Together() + +DEPLOYMENT = os.environ.get("TOGETHER_DEPLOYMENT_NAME", "hello-world") + + +def submit_and_poll(payload: dict, priority: int = 1) -> dict: + """Submit a job and poll until completion.""" + job = client.beta.jig.queue.submit( + model=DEPLOYMENT, + payload=payload, + priority=priority, + ) + print(f"Submitted job: {job.request_id}") + + while True: + status = client.beta.jig.queue.retrieve( + request_id=job.request_id, + model=DEPLOYMENT, + ) + print(f" Status: {status.status}", end="") + + # Show progress if available + if hasattr(status, "info") and status.info: + progress = status.info.get("progress") + if progress is not None: + print(f" | Progress: {progress:.0%}", end="") + print() + + if status.status == "done": + print(f" Outputs: {status.outputs}") + return {"status": "done", "outputs": status.outputs} + elif status.status == "failed": + print(f" Error: {getattr(status, 'error', 'unknown')}") + return {"status": "failed", "error": getattr(status, "error", None)} + elif status.status == "canceled": + print(" Job was canceled") + return {"status": "canceled"} + + time.sleep(2) + + +def submit_multiple(payloads: list[dict]) -> list[str]: + """Submit multiple jobs and return their request IDs.""" + request_ids = [] + for payload in payloads: + job = client.beta.jig.queue.submit( + model=DEPLOYMENT, + payload=payload, + ) + request_ids.append(job.request_id) + print(f"Submitted: {job.request_id}") + return request_ids + + +def check_status(request_id: str) -> dict: + """Check the status of a single job.""" + status = client.beta.jig.queue.retrieve( + request_id=request_id, + model=DEPLOYMENT, + ) + print(f"Job {request_id}: {status.status}") + if status.status == "done": + print(f" Outputs: 
{status.outputs}") + return {"status": status.status, "outputs": getattr(status, "outputs", None)} + + +if __name__ == "__main__": + # --- Example 1: Submit and wait for result --- + print("=== Submit and poll ===") + result = submit_and_poll({"name": "Together"}) + print() + + # --- Example 2: Submit with priority --- + print("=== Priority job ===") + result = submit_and_poll({"name": "Priority User"}, priority=10) + print() + + # --- Example 3: Submit batch --- + print("=== Batch submit ===") + ids = submit_multiple([ + {"name": "Alice"}, + {"name": "Bob"}, + {"name": "Charlie"}, + ]) + print(f"Submitted {len(ids)} jobs") + + # Poll all + for rid in ids: + time.sleep(1) + check_status(rid) diff --git a/plugins/togetherai/skills/together-dedicated-containers/scripts/queue_client.ts b/plugins/togetherai/skills/together-dedicated-containers/scripts/queue_client.ts new file mode 100644 index 00000000..7f70f61d --- /dev/null +++ b/plugins/togetherai/skills/together-dedicated-containers/scripts/queue_client.ts @@ -0,0 +1,124 @@ +#!/usr/bin/env -S npx tsx +/** + * Together AI Dedicated Containers - Queue Client (TypeScript SDK) + * + * Submit jobs, poll for results, and manage queue operations. + * + * Usage: + * npx tsx queue_client.ts + * + * Requires: + * npm install together-ai + * export TOGETHER_API_KEY=your_key + * export TOGETHER_DEPLOYMENT_NAME=your-deployment-name + */ + +import Together from "together-ai"; + +const client = new Together(); + +const DEPLOYMENT = process.env.TOGETHER_DEPLOYMENT_NAME ?? 
"hello-world"; + +async function submitAndPoll( + payload: Record, + priority: number = 1 +): Promise { + const job = await client.beta.jig.queue.submit({ + model: DEPLOYMENT, + payload, + priority, + }); + const requestId = job.requestId; + if (!requestId) { + throw new Error("Queue submit response did not include a request id."); + } + console.log(`Submitted job: ${requestId}`); + + while (true) { + const status: any = await client.beta.jig.queue.retrieve({ + request_id: requestId, + model: DEPLOYMENT, + }); + + let line = ` Status: ${status.status}`; + if (status.info?.progress !== undefined) { + line += ` | Progress: ${(status.info.progress * 100).toFixed(0)}%`; + } + console.log(line); + + if (status.status === "done") { + console.log(" Outputs:", JSON.stringify(status.outputs)); + return { status: "done", outputs: status.outputs }; + } else if (status.status === "failed") { + console.log(" Error:", status.error); + return { status: "failed", error: status.error }; + } else if (status.status === "canceled") { + console.log(" Job was canceled"); + return { status: "canceled" }; + } + + await new Promise((r) => setTimeout(r, 2000)); + } +} + +async function submitMultiple( + payloads: Record[] +): Promise { + const requestIds: string[] = []; + for (const payload of payloads) { + const job = await client.beta.jig.queue.submit({ + model: DEPLOYMENT, + payload, + }); + const requestId = job.requestId; + if (!requestId) { + throw new Error("Queue submit response did not include a request id."); + } + requestIds.push(requestId); + console.log(`Submitted: ${requestId}`); + } + return requestIds; +} + +async function checkStatus(requestId: string): Promise { + const status: any = await client.beta.jig.queue.retrieve({ + request_id: requestId, + model: DEPLOYMENT, + }); + console.log(`Job ${requestId}: ${status.status}`); + if (status.status === "done") { + console.log(" Outputs:", JSON.stringify(status.outputs)); + } + return { status: status.status, outputs: 
status.outputs }; +} + +async function main() { + // --- Example 1: Submit and wait for result --- + console.log("=== Submit and poll ==="); + await submitAndPoll({ name: "Together" }); + console.log(); + + // --- Example 2: Submit with priority --- + console.log("=== Priority job ==="); + await submitAndPoll({ name: "Priority User" }, 10); + console.log(); + + // --- Example 3: Submit batch --- + console.log("=== Batch submit ==="); + const ids = await submitMultiple([ + { name: "Alice" }, + { name: "Bob" }, + { name: "Charlie" }, + ]); + console.log(`Submitted ${ids.length} jobs`); + + for (const rid of ids) { + await new Promise((r) => setTimeout(r, 1000)); + await checkStatus(rid); + } +} + +main().catch((error) => { + console.error(error); + process.exit(1); +}); diff --git a/plugins/togetherai/skills/together-dedicated-containers/scripts/sprocket_hello_world.py b/plugins/togetherai/skills/together-dedicated-containers/scripts/sprocket_hello_world.py new file mode 100644 index 00000000..460e2c01 --- /dev/null +++ b/plugins/togetherai/skills/together-dedicated-containers/scripts/sprocket_hello_world.py @@ -0,0 +1,85 @@ +#!/usr/bin/env python3 +""" +Together AI Dedicated Containers - Sprocket Hello World App + +A minimal Sprocket worker template. Deploy with `together beta jig deploy`. + +This file is the application entrypoint (app.py). Pair it with a +pyproject.toml configuration (see example below). 
+ +Usage: + # Local test (requires sprocket installed) + python sprocket_hello_world.py + + # Deploy to Together + together beta jig deploy + +Requires: + pip install sprocket --extra-index-url https://pypi.together.ai/ + # For deployment: uv pip install "together>=2.0.0" + +Example pyproject.toml: + [project] + name = "my-sprocket-app" + version = "0.1.0" + + [tool.jig.image] + python_version = "3.11" + system_packages = [] + cmd = "python app.py --queue" + copy = ["app.py"] + + [tool.jig.deploy] + gpu_type = "h100-80gb" + gpu_count = 1 + min_replicas = 1 + max_replicas = 5 + port = 8000 + health_check_path = "/health" +""" + +import sprocket + + +class HelloModel(sprocket.Sprocket): + """Minimal Sprocket worker - echo input with transformation.""" + + def setup(self) -> None: + """Called once at startup. Load models, weights, or resources here.""" + # Example: self.model = load_my_model("weights/") + print("Model loaded and ready.") + + def predict(self, args: dict) -> dict: + """Called for each job. Process input and return output. + + Args: + args: Job payload dict (from queue submission or HTTP request). + + Returns: + dict: Response payload sent back to the caller. + """ + text = args.get("text", "") + operation = args.get("operation", "upper") + + if operation == "upper": + result = text.upper() + elif operation == "reverse": + result = text[::-1] + elif operation == "word_count": + result = str(len(text.split())) + else: + result = text + + return { + "input": text, + "operation": operation, + "result": result, + } + + def shutdown(self) -> None: + """Called on graceful shutdown. 
Optional cleanup.""" + print("Shutting down.") + + +if __name__ == "__main__": + sprocket.run(HelloModel()) diff --git a/plugins/togetherai/skills/together-dedicated-endpoints/SKILL.md b/plugins/togetherai/skills/together-dedicated-endpoints/SKILL.md new file mode 100644 index 00000000..2c7c3a5e --- /dev/null +++ b/plugins/togetherai/skills/together-dedicated-endpoints/SKILL.md @@ -0,0 +1,80 @@ +--- +name: together-dedicated-endpoints +description: "Single-tenant GPU endpoints on Together AI with autoscaling and no rate limits. Deploy fine-tuned or uploaded models, size hardware, and manage endpoint lifecycle. Reach for it whenever the user needs predictable always-on hosting rather than serverless inference, custom containers, or raw clusters." +--- + +# Together Dedicated Endpoints + +## Overview + +Use dedicated endpoints for managed single-tenant model hosting with predictable performance and +no shared serverless pool. + +Typical fits: + +- production inference with stable latency +- fine-tuned model hosting +- uploaded custom model hosting +- autoscaled model APIs + +## When This Skill Wins + +- The user needs always-on or single-tenant hosting +- The model is supported for dedicated deployment +- Fine-tuned or uploaded models must be served as endpoints +- Hardware, scaling, or idle-time settings need explicit control + +## Hand Off To Another Skill + +- Use `together-chat-completions` for serverless chat inference +- Use `together-dedicated-containers` for custom runtimes or nonstandard inference pipelines +- Use `together-gpu-clusters` for raw infrastructure or cluster orchestration + +## Quick Routing + +- Create and manage a standard endpoint + - Start with [scripts/manage_endpoint.py](scripts/manage_endpoint.py) or [scripts/manage_endpoint.ts](scripts/manage_endpoint.ts) + - Read [references/api-reference.md](references/api-reference.md) +- Lifecycle tuning or troubleshooting + - Read [references/api-reference.md](references/api-reference.md) +- Deploy 
a fine-tuned model + - Start with [scripts/deploy_finetuned.py](scripts/deploy_finetuned.py) + - Read [references/dedicated-models.md](references/dedicated-models.md) +- Upload and deploy a custom model + - Start with [scripts/upload_custom_model.py](scripts/upload_custom_model.py) + - Read [references/dedicated-models.md](references/dedicated-models.md) +- Hardware and sizing choices + - Read [references/hardware-options.md](references/hardware-options.md) + +## Workflow + +1. Confirm that the task needs dedicated hosting instead of serverless or containers. +2. Verify model eligibility and inspect available hardware. +3. Create the endpoint with explicit scaling and timeout settings. +4. Wait for readiness before sending inference traffic. +5. Stop or delete the endpoint when the workload no longer needs to run. + +## High-Signal Rules + +- Python scripts require the Together v2 SDK (`together>=2.0.0`). If the user is on an older version, they must upgrade first: `uv pip install --upgrade "together>=2.0.0"`. +- Model eligibility and hardware availability are gating constraints; check them early. +- Endpoint management uses endpoint IDs, while inference usually uses the endpoint name as `model`. +- Autoscaling, auto-shutdown, prompt caching, and speculative decoding materially affect operations and cost. +- For custom or fine-tuned models, do not skip the intermediate verification steps before deployment. 
+ +## Resource Map + +- API reference: [references/api-reference.md](references/api-reference.md) +- Operational controls and troubleshooting: [references/api-reference.md](references/api-reference.md) +- Dedicated model guide: [references/dedicated-models.md](references/dedicated-models.md) +- Hardware guide: [references/hardware-options.md](references/hardware-options.md) +- Python endpoint lifecycle: [scripts/manage_endpoint.py](scripts/manage_endpoint.py) +- TypeScript endpoint lifecycle: [scripts/manage_endpoint.ts](scripts/manage_endpoint.ts) +- Fine-tuned deployment: [scripts/deploy_finetuned.py](scripts/deploy_finetuned.py) +- Custom model upload and deployment: [scripts/upload_custom_model.py](scripts/upload_custom_model.py) + +## Official Docs + +- [Dedicated Endpoints](https://docs.together.ai/docs/dedicated-inference) +- [Endpoints API](https://docs.together.ai/reference/createendpoint) +- [Upload and Deploy Custom Models](https://docs.together.ai/docs/custom-models) diff --git a/plugins/togetherai/skills/together-dedicated-endpoints/references/api-reference.md b/plugins/togetherai/skills/together-dedicated-endpoints/references/api-reference.md new file mode 100644 index 00000000..8da0336f --- /dev/null +++ b/plugins/togetherai/skills/together-dedicated-endpoints/references/api-reference.md @@ -0,0 +1,583 @@ +# Dedicated Endpoints API Reference +## Contents + +- [Endpoints](#endpoints) +- [Create Endpoint](#create-endpoint) +- [Get Endpoint](#get-endpoint) +- [List Endpoints](#list-endpoints) +- [Endpoint States](#endpoint-states) +- [Update Endpoint](#update-endpoint) +- [Start / Stop](#start-stop) +- [Delete](#delete) +- [List Hardware](#list-hardware) +- [Upload Model](#upload-model) +- [List Models](#list-models) +- [Using the Endpoint](#using-the-endpoint) +- [Auto-Shutdown](#auto-shutdown) +- [Speculative Decoding](#speculative-decoding) +- [Prompt Caching](#prompt-caching) +- [Availability Zones](#availability-zones) +- 
[Troubleshooting](#troubleshooting) +- [CLI Reference](#cli-reference) +- [Endpoint Response Object](#endpoint-response-object) + + +## Endpoints + +| Endpoint | Operation | Description | +|--------|------|-------------| +| `POST /endpoints` | Create endpoint | Deploy a new dedicated endpoint | +| `GET /endpoints` | List endpoints | List all endpoints | +| `GET /endpoints/{id}` | Get endpoint | Get endpoint details | +| `PATCH /endpoints/{id}` | Update endpoint | Update config/scaling | +| `DELETE /endpoints/{id}` | Delete endpoint | Remove endpoint | +| `GET /hardware` | List hardware | Available hardware configs | +| `POST /models` | Upload model | Upload custom model | +| `GET /models` | List models | List available models | + +Base URL: `https://api.together.xyz/v1` + +## Create Endpoint + +```python +endpoint = client.endpoints.create( + model="Qwen/Qwen3.5-9B-FP8", + hardware="1x_nvidia_h100_80gb_sxm", + display_name="My Endpoint", + autoscaling={"min_replicas": 1, "max_replicas": 3}, + inactive_timeout=60, # minutes, 0 or None to disable +) +print(endpoint.id) # endpoint-abc123 +``` + +```typescript +import Together from "together-ai"; +const together = new Together(); + +const endpoint = await together.endpoints.create({ + model: "Qwen/Qwen3.5-9B-FP8", + hardware: "1x_nvidia_h100_80gb_sxm", + autoscaling: { + min_replicas: 1, + max_replicas: 3, + }, +}); +console.log(endpoint.id); +``` + +```shell +curl -X POST "https://api.together.xyz/v1/endpoints" \ + -H "Authorization: Bearer $TOGETHER_API_KEY" \ + -H "Content-Type: application/json" \ + -d '{ + "model": "Qwen/Qwen3.5-9B-FP8", + "hardware": "1x_nvidia_h100_80gb_sxm", + "display_name": "My Endpoint", + "autoscaling": { + "min_replicas": 1, + "max_replicas": 3 + } + }' +``` + +```shell +together endpoints create \ + --model Qwen/Qwen3.5-9B-FP8 \ + --hardware 1x_nvidia_h100_80gb_sxm \ + --display-name "My Endpoint" \ + --min-replicas 1 --max-replicas 3 \ + --wait +``` + +### Request Body + +| Field | Type | 
Required | Default | Description | +|-------|------|----------|---------|-------------| +| `model` | string | Yes | - | Model to deploy | +| `hardware` | string | Yes | - | Hardware config ID | +| `autoscaling` | object | Yes | - | `{min_replicas, max_replicas}` | +| `display_name` | string | No | - | Human-readable name | +| `disable_speculative_decoding` | bool | No | false | Disable spec decoding | +| `state` | string | No | `"STARTED"` | `"STARTED"` or `"STOPPED"` | +| `inactive_timeout` | int/null | No | 60 | Minutes before auto-stop (0/null disables) | +| `availability_zone` | string | No | - | Preferred zone | + +## Get Endpoint + +```python +endpoint = client.endpoints.retrieve("endpoint-abc123") +print(endpoint.state) +``` + +```typescript +const endpoint = await together.endpoints.retrieve("endpoint-abc123"); +console.log(endpoint); +``` + +```shell +curl "https://api.together.xyz/v1/endpoints/endpoint-abc123" \ + -H "Authorization: Bearer $TOGETHER_API_KEY" \ + -H "Content-Type: application/json" +``` + +```shell +together endpoints retrieve +together endpoints retrieve --json +``` + +## List Endpoints + +```python +response = client.endpoints.list() +for ep in response.data: + print(ep.id) +``` + +```typescript +const endpoints = await together.endpoints.list(); +for (const endpoint of endpoints.data) { + console.log(endpoint); +} +``` + +```shell +curl "https://api.together.xyz/v1/endpoints" \ + -H "Authorization: Bearer $TOGETHER_API_KEY" \ + -H "Content-Type: application/json" + +# Filter by type and ownership +curl "https://api.together.xyz/v1/endpoints?type=dedicated&mine=true" \ + -H "Authorization: Bearer $TOGETHER_API_KEY" \ + -H "Content-Type: application/json" +``` + +```shell +together endpoints list --mine +together endpoints list --type dedicated +together endpoints list --mine --type dedicated --usage-type on-demand +together endpoints list --json +``` + +### Query Parameters + +| Parameter | Type | Description | 
+|-----------|------|-------------| +| `type` | string | Filter by `dedicated` or `serverless` | +| `usage_type` | string | Filter by `on-demand` or `reserved` | +| `mine` | boolean | Only endpoints owned by the caller | + +## Endpoint States + +| State | Description | +|-------|-------------| +| `PENDING` | Waiting for resources | +| `STARTING` | Initializing | +| `STARTED` | Running, accepting requests | +| `STOPPING` | Shutting down | +| `STOPPED` | Not running | +| `ERROR` | Failed | + +## Update Endpoint + +```python +client.endpoints.update( + "endpoint-abc123", + autoscaling={"min_replicas": 2, "max_replicas": 5}, + display_name="Updated Name", +) +``` + +```typescript +await together.endpoints.update("endpoint-abc123", { + autoscaling: { min_replicas: 2, max_replicas: 5 }, + display_name: "Updated Name", +}); +``` + +```shell +curl -X PATCH "https://api.together.xyz/v1/endpoints/endpoint-abc123" \ + -H "Authorization: Bearer $TOGETHER_API_KEY" \ + -H "Content-Type: application/json" \ + -d '{ + "autoscaling": { + "min_replicas": 2, + "max_replicas": 5 + }, + "display_name": "Updated Name" + }' +``` + +```shell +together endpoints update --min-replicas 2 --max-replicas 5 +together endpoints update --display-name "Updated Name" +``` + +### Updatable Fields + +- `display_name` +- `state` (`"STARTED"` or `"STOPPED"`) +- `autoscaling` +- `inactive_timeout` + +## Start / Stop + +```python +# Start +client.endpoints.update("endpoint-abc123", state="STARTED") + +# Stop +client.endpoints.update("endpoint-abc123", state="STOPPED") +``` + +```typescript +// Start +await together.endpoints.update("endpoint-abc123", { state: "STARTED" }); + +// Stop +await together.endpoints.update("endpoint-abc123", { state: "STOPPED" }); +``` + +```shell +# Start +curl -X PATCH "https://api.together.xyz/v1/endpoints/endpoint-abc123" \ + -H "Authorization: Bearer $TOGETHER_API_KEY" \ + -H "Content-Type: application/json" \ + -d '{"state": "STARTED"}' + +# Stop +curl -X PATCH 
"https://api.together.xyz/v1/endpoints/endpoint-abc123" \ + -H "Authorization: Bearer $TOGETHER_API_KEY" \ + -H "Content-Type: application/json" \ + -d '{"state": "STOPPED"}' +``` + +```shell +together endpoints start +together endpoints start --wait +together endpoints stop +together endpoints stop --wait +``` + +## Delete + +Returns HTTP 204 on success. + +```python +client.endpoints.delete("endpoint-abc123") +``` + +```typescript +await together.endpoints.delete("endpoint-abc123"); +``` + +```shell +curl -X DELETE "https://api.together.xyz/v1/endpoints/endpoint-abc123" \ + -H "Authorization: Bearer $TOGETHER_API_KEY" +``` + +```shell +together endpoints delete +``` + +## List Hardware + +```python +response = client.endpoints.list_hardware() +for hw in response.data: + print(hw.id) + +# Filter by model +response = client.endpoints.list_hardware(model="Qwen/Qwen3.5-9B-FP8") +for hw in response.data: + price = hw.pricing.cents_per_minute if hw.pricing else "N/A" + print(f"{hw.id}: {hw.specs.gpu_count}x {hw.specs.gpu_type} @ {price}c/min") +``` + +```typescript +const hardware = await together.endpoints.listHardware(); +console.log(hardware); +``` + +```shell +curl "https://api.together.xyz/v1/hardware" \ + -H "Authorization: Bearer $TOGETHER_API_KEY" \ + -H "Content-Type: application/json" + +# Filter by model +curl "https://api.together.xyz/v1/hardware?model=Qwen/Qwen3.5-9B-FP8" \ + -H "Authorization: Bearer $TOGETHER_API_KEY" \ + -H "Content-Type: application/json" +``` + +```shell +together endpoints hardware +together endpoints hardware --model Qwen/Qwen3.5-9B-FP8 +together endpoints hardware --model Qwen/Qwen3.5-9B-FP8 --available +together endpoints hardware --model Qwen/Qwen3.5-9B-FP8 --json +``` + +### Hardware Response Object + +```json +{ + "object": "hardware", + "id": "1x_nvidia_h100_80gb_sxm", + "pricing": { "cents_per_minute": 10.82 }, + "specs": { + "gpu_type": "h100", + "gpu_link": "sxm", + "gpu_memory": 80, + "gpu_count": 1 + }, + "availability": 
{ "status": "available" }, + "updated_at": "2025-01-15T14:30:00Z" +} +``` + +## Upload Model + +```python +response = client.models.upload( + model_name="my-custom-model", + model_source="https://huggingface.co/your-org/your-model", + hf_token="hf_...", +) +print(response.data.job_id) +``` + +```typescript +const response = await together.models.upload({ + model_name: "my-custom-model", + model_source: "https://huggingface.co/your-org/your-model", +}); +console.log(response); +``` + +```shell +curl -X POST "https://api.together.xyz/v1/models" \ + -H "Authorization: Bearer $TOGETHER_API_KEY" \ + -H "Content-Type: application/json" \ + -d '{ + "model_name": "my-custom-model", + "model_source": "https://huggingface.co/your-org/your-model", + "hf_token": "hf_..." + }' +``` + +```shell +together models upload \ + --model-name my-custom-model \ + --model-source https://huggingface.co/your-org/your-model \ + --hf-token $HF_TOKEN +``` + +### Upload Parameters + +| Field | Type | Required | Description | +|-------|------|----------|-------------| +| `model_name` | string | Yes | Name for the uploaded model | +| `model_source` | string | Yes | Hugging Face repo URL or S3 presigned URL | +| `model_type` | string | No | `"model"` (default) or `"adapter"` | +| `hf_token` | string | No | Hugging Face token for private repos | +| `description` | string | No | Model description | +| `base_model` | string | No | Base model for adapters (serverless) | +| `lora_model` | string | No | LoRA pool for adapters (dedicated) | + +### Upload Response + +```json +{ + "data": { + "job_id": "job-b641db51-38e8-40f2-90a0-5353aeda6f21", + "model_name": "devuser/my-custom-model", + "model_source": "remote_archive" + }, + "message": "job created" +} +``` + +### Check Upload Status + +```shell +curl "https://api.together.xyz/v1/jobs/<JOB_ID>" \ + -H "Authorization: Bearer $TOGETHER_API_KEY" +``` + +## List Models + +```python +models = client.models.list() +for model in models: + print(model.id) +``` + 
+```shell +# All models +curl "https://api.together.xyz/v1/models" \ + -H "Authorization: Bearer $TOGETHER_API_KEY" \ + -H "Content-Type: application/json" + +# Dedicated-eligible models only +curl "https://api.together.xyz/v1/models?dedicated=true" \ + -H "Authorization: Bearer $TOGETHER_API_KEY" \ + -H "Content-Type: application/json" +``` + +```shell +together models list +together models list --type dedicated +together models list --json +``` + +## Using the Endpoint + +Once STARTED, use the Chat Completions API with either the endpoint name or ID. For +management calls, use the endpoint ID. For inference, prefer the endpoint name once the deployment +is stable: + +```python +response = client.chat.completions.create( + model="endpoint-abc123", # or endpoint name + messages=[{"role": "user", "content": "Hello!"}], +) +``` + +```typescript +const response = await together.chat.completions.create({ + model: "endpoint-abc123", // or endpoint name + messages: [{ role: "user", content: "Hello!" }], +}); +console.log(response.choices[0].message.content); +``` + +```shell +curl -X POST "https://api.together.xyz/v1/chat/completions" \ + -H "Authorization: Bearer $TOGETHER_API_KEY" \ + -H "Content-Type: application/json" \ + -d '{ + "model": "endpoint-abc123", + "messages": [{"role": "user", "content": "Hello!"}] + }' +``` + +## Auto-Shutdown + +Endpoints auto-stop after 1 hour of inactivity by default. Set `inactive_timeout` in minutes to +change the behavior. Use `0` or `null` to disable auto-shutdown entirely. + +## Speculative Decoding + +Speculative decoding is controlled by `disable_speculative_decoding`. Leave it enabled for general +throughput-oriented workloads. Disable it when tail latency matters more than average throughput. + +## Prompt Caching + +Prompt caching is enabled for dedicated endpoints and reduces latency on repeated prompt prefixes. +Treat it as a default performance optimization rather than an optional advanced feature. 
+ +## Availability Zones + +```shell +together endpoints availability-zones +together endpoints create --availability-zone us-central-4b ... +``` + +Only constrain availability zones when the workload has real geography or latency requirements. +Restricting zones narrows available capacity and can make hardware placement harder. + +## Troubleshooting + +| Issue | Solution | +|-------|----------| +| Hardware unavailable | Try a different compatible model or retry when capacity changes | +| Hardware not eligible (404: "not available for this model") | The model only supports specific hardware configs. Run `list_hardware(model=...)` to see eligible options. Fine-tuned models often require larger hardware than their parameter count suggests | +| Endpoint queued (not starting) | Reduce `min_replicas` to match currently available capacity | +| Low replica scaling | Reduce `max_replicas` or wait for more hardware to become available | +| Model not supported | Use a dedicated-eligible model from `together models list --type dedicated` | +| Fine-tuned model won't deploy | Confirm the base model is supported on dedicated endpoints | + +## CLI Reference + +### Endpoint Commands + +| Command | Description | +|---------|-------------| +| `together endpoints create` | Create a new endpoint | +| `together endpoints retrieve ` | Get endpoint details | +| `together endpoints list` | List endpoints | +| `together endpoints update ` | Update endpoint config | +| `together endpoints start ` | Start a stopped endpoint | +| `together endpoints stop ` | Stop a running endpoint | +| `together endpoints delete ` | Delete an endpoint | +| `together endpoints hardware` | List available hardware | +| `together endpoints availability-zones` | List availability zones | + +### Model Commands + +| Command | Description | +|---------|-------------| +| `together models upload` | Upload a custom model | +| `together models list` | List available models | + +### Create Options + +| Flag | Description 
| +|------|-------------| +| `--model` | (required) Model to deploy | +| `--hardware` | (required) Hardware config ID | +| `--min-replicas` | Minimum replica count | +| `--max-replicas` | Maximum replica count | +| `--display-name` | Human-readable name | +| `--no-auto-start` | Create in STOPPED state | +| `--no-speculative-decoding` | Disable speculative decoding | +| `--availability-zone` | Preferred availability zone | +| `--wait` | Wait for endpoint to be ready | +| `--json` | Output in JSON format | + +### List Options + +| Flag | Description | +|------|-------------| +| `--mine` | Show only your endpoints | +| `--type` | Filter by `dedicated` or `serverless` | +| `--usage-type` | Filter by `on-demand` or `reserved` | +| `--json` | Output in JSON format | + +### Hardware Options + +| Flag | Description | +|------|-------------| +| `--model` | Filter by model compatibility | +| `--available` | Show only available hardware (requires `--model`) | +| `--json` | Output in JSON format | + +### Upload Options + +| Flag | Description | +|------|-------------| +| `--model-name` | (required) Name for the uploaded model | +| `--model-source` | (required) HF repo URL or S3 presigned URL | +| `--model-type` | `model` or `adapter` | +| `--hf-token` | Hugging Face API token | +| `--description` | Model description | +| `--base-model` | Base model for adapters (serverless) | +| `--lora-model` | LoRA pool for adapters (dedicated) | +| `--json` | Output in JSON format | + +## Endpoint Response Object + +```json +{ + "object": "endpoint", + "id": "endpoint-d23901de-ef8f-44bf-b3e7-de9c1ca8f2d7", + "name": "devuser/Qwen/Qwen3.5-9B-FP8-a32b82a1", + "display_name": "My Endpoint", + "model": "Qwen/Qwen3.5-9B-FP8", + "hardware": "1x_nvidia_h100_80gb_sxm", + "type": "dedicated", + "owner": "devuser", + "state": "STARTED", + "autoscaling": { "min_replicas": 1, "max_replicas": 3 }, + "created_at": "2025-02-04T10:43:55.405Z" +} +``` diff --git 
a/plugins/togetherai/skills/together-dedicated-endpoints/references/dedicated-models.md b/plugins/togetherai/skills/together-dedicated-endpoints/references/dedicated-models.md new file mode 100644 index 00000000..c3dbb552 --- /dev/null +++ b/plugins/togetherai/skills/together-dedicated-endpoints/references/dedicated-models.md @@ -0,0 +1,150 @@ +# Dedicated Endpoints Model Reference +## Contents + +- [Chat Models](#chat-models) +- [Image Models](#image-models) +- [Transcription Models](#transcription-models) +- [Moderation Models](#moderation-models) +- [Rerank Models](#rerank-models) +- [Custom and Fine-tuned Models](#custom-and-fine-tuned-models) + + +Models available for deployment on dedicated endpoints. This list changes frequently -- use +`together models list --type dedicated` for the current catalog. + +## Chat Models + +| Model | API ID | Context | +|-------|--------|---------| +| DeepSeek R1-0528 | deepseek-ai/DeepSeek-R1 | 163,840 | +| DeepSeek R1 Distill Llama 70B | deepseek-ai/DeepSeek-R1-Distill-Llama-70B | 131,072 | +| DeepSeek R1 Distill Qwen 14B | deepseek-ai/DeepSeek-R1-Distill-Qwen-14B | 131,072 | +| DeepSeek V3-0324 | deepseek-ai/DeepSeek-V3 | 131,072 | +| DeepSeek V3.1 | deepseek-ai/DeepSeek-V3.1 | 131,072 | +| LLaMA-2 70B | meta-llama/Llama-2-70b-hf | 4,096 | +| Llama 3.1 405B Instruct | meta-llama/Llama-3.1-405B-Instruct | 4,096 | +| Llama 3.2 1B Instruct | meta-llama/Llama-3.2-1B-Instruct | 131,072 | +| Llama 3.3 70B Instruct Turbo | meta-llama/Llama-3.3-70B-Instruct-Turbo | 131,072 | +| Llama 4 Maverick 17Bx128E | meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 | 1,048,576 | +| Llama 4 Scout 17Bx16E | meta-llama/Llama-4-Scout-17B-16E-Instruct | 1,048,576 | +| Meta Llama 3 70B Instruct Turbo | meta-llama/Meta-Llama-3-70B-Instruct-Turbo | 8,192 | +| Meta Llama 3 8B Instruct | meta-llama/Meta-Llama-3-8B-Instruct | 8,192 | +| Mistral 7B Instruct v0.1 | mistralai/Mistral-7B-Instruct-v0.1 | 32,768 | +| Mistral 7B Instruct v0.2 | 
mistralai/Mistral-7B-Instruct-v0.2 | 32,768 | +| Mistral 7B Instruct v0.3 | mistralai/Mistral-7B-Instruct-v0.3 | 32,768 | +| Mixtral-8x7B Instruct v0.1 | mistralai/Mixtral-8x7B-Instruct-v0.1 | 32,768 | +| OpenAI GPT-OSS 120B | openai/gpt-oss-120b | 131,072 | +| OpenAI GPT-OSS 20B | openai/gpt-oss-20b | 131,072 | +| Qwen2.5 72B Instruct | Qwen/Qwen2.5-72B-Instruct | 32,768 | +| Qwen2.5 72B Instruct Turbo | Qwen/Qwen2.5-72B-Instruct-Turbo | 131,072 | +| Qwen2.5 7B Instruct Turbo | Qwen/Qwen2.5-7B-Instruct-Turbo | 32,768 | +| Qwen2.5 Coder 32B Instruct | Qwen/Qwen2.5-Coder-32B-Instruct | 16,384 | +| Qwen2.5-VL 72B Instruct | Qwen/Qwen2.5-VL-72B-Instruct | 32,768 | +| Qwen3 235B A22B FP8 | Qwen/Qwen3-235B-A22B-fp8-tput | 40,960 | +| Qwen3 Coder 480B A35B FP8 | Qwen/Qwen3-Coder-480B-A35B-Instruct-FP8 | 262,144 | +| Qwen3 Next 80B A3B | Qwen/Qwen3-Next-80B-A3B-Instruct | 262,144 | +| QwQ-32B | Qwen/QwQ-32B | 131,072 | +| GLM-4.5 Air FP8 | zai-org/GLM-4.5-Air-FP8 | 131,072 | + +## Image Models + +| Model | API ID | +|-------|--------| +| FLUX.1 Kontext [max] | black-forest-labs/FLUX.1-kontext-max | +| FLUX.1 Kontext [pro] | black-forest-labs/FLUX.1-kontext-pro | + +## Transcription Models + +| Model | API ID | +|-------|--------| +| Whisper large-v3 | openai/whisper-large-v3 | + +## Moderation Models + +| Model | API ID | Context | +|-------|--------|---------| +| Llama Guard 4 12B | meta-llama/Llama-Guard-4-12B | 1,048,576 | + +## Rerank Models + +| Model | API ID | Context | +|-------|--------|---------| +| Llama Rank V1 | Salesforce/Llama-Rank-V1 | 8,192 | + +## Custom and Fine-tuned Models + +Custom uploaded models and fine-tuned models can also be deployed on dedicated endpoints. 
+ +### Requirements + +- Format: Hugging Face-compatible (`config.json`, tokenizer files, safetensors) +- Types: Text generation and embedding models +- Scale: Must fit on a single node (multi-node not supported) +- Fine-tuned: Base model must be a supported dedicated endpoint model +- Sources: Hugging Face Hub or S3 (`.zip`, `.tar`, `.tar.gz`) + +### Upload Custom Model + +```python +from together import Together +client = Together() + +# From Hugging Face +response = client.models.upload( + model_name="my-custom-model", + model_source="https://huggingface.co/your-org/your-model", + hf_token="hf_...", +) +print(response.data.job_id) + +# From S3 (presigned URL, at least 100 min validity) +response = client.models.upload( + model_name="my-s3-model", + model_source="https://my-bucket.s3.amazonaws.com/model.tar.gz?...", +) +``` + +```shell +# From Hugging Face +together models upload \ + --model-name my-custom-model \ + --model-source https://huggingface.co/your-org/your-model \ + --hf-token $HF_TOKEN + +# From S3 +together models upload \ + --model-name my-s3-model \ + --model-source "$PRESIGNED_URL" +``` + +### Deploy Custom or Fine-tuned Model + +```shell +# Verify model appears +together models list + +# Check hardware options +together endpoints hardware --model <MODEL_NAME> + +# Deploy +together endpoints create \ + --model <MODEL_NAME> \ + --hardware 2x_nvidia_h100_80gb_sxm \ + --display-name "Custom Model Endpoint" \ + --no-speculative-decoding \ + --wait +``` + +### Deploy Fine-tuned Model + +```shell +# Find model output name from fine-tuning job +together fine-tuning list + +# Deploy +together endpoints create \ + --model <FINE_TUNED_MODEL_NAME> \ + --hardware 4x_nvidia_h100_80gb_sxm \ + --display-name "Fine-tuned Endpoint" \ + --wait +``` diff --git a/plugins/togetherai/skills/together-dedicated-endpoints/references/hardware-options.md b/plugins/togetherai/skills/together-dedicated-endpoints/references/hardware-options.md new file mode 100644 index 00000000..2617eddf --- /dev/null +++ 
b/plugins/togetherai/skills/together-dedicated-endpoints/references/hardware-options.md @@ -0,0 +1,165 @@ +# Dedicated Endpoints Hardware Reference +## Contents + +- [Hardware ID Format](#hardware-id-format) +- [GPU Types](#gpu-types) +- [Common Configurations](#common-configurations) +- [Hardware Availability Status](#hardware-availability-status) +- [Hardware Response Object](#hardware-response-object) +- [Pricing Model](#pricing-model) +- [GPU Selection Guide](#gpu-selection-guide) +- [Scaling](#scaling) +- [Autoscaling Schema](#autoscaling-schema) + + +## Hardware ID Format + +`[count]x_nvidia_[gpu_type]_[memory]_[link]` + +Example: `2x_nvidia_h100_80gb_sxm` + +## GPU Types + +Currently offered hardware families: + +| GPU | Memory | Notes | +|-----|--------|-------| +| H100 SXM | 80GB | Production workhorse, broad model coverage | +| H200 SXM | 140GB | Larger HBM than H100 for memory-bound workloads | +| B200 SXM | 180GB | Highest performance, largest single-GPU memory | + +A100, L40, L40S, and RTX 6000 are no longer offered for new dedicated endpoints. The `/v1/hardware` +endpoint may still return deprecated SKUs; treat only H100, H200, and B200 as deployable. + +Hardware availability varies by region and demand. 
Use the API or CLI to get current options: + +```python +from together import Together +client = Together() + +response = client.endpoints.list_hardware(model="Qwen/Qwen3.5-9B-FP8") +for hw in response.data: + status = hw.availability.status if hw.availability else "unknown" + price = hw.pricing.cents_per_minute if hw.pricing else "N/A" + print(f" {hw.id} ({status}, {price}c/min)") +``` + +```shell +together endpoints hardware --model Qwen/Qwen3.5-9B-FP8 --available +``` + +## Common Configurations + +| Hardware ID | GPU | Count | Typical Use | +|------------|-----|-------|-------------| +| `1x_nvidia_h100_80gb_sxm` | H100 | 1 | Small models (up to ~9B) | +| `2x_nvidia_h100_80gb_sxm` | H100 | 2 | Medium models (7-20B) | +| `4x_nvidia_h100_80gb_sxm` | H100 | 4 | Large models (70B) | +| `8x_nvidia_h100_80gb_sxm` | H100 | 8 | Very large models (120B+, MoE) | +| `1x_nvidia_h200_140gb_sxm` | H200 | 1 | Memory-bound small/medium models | +| `4x_nvidia_h200_140gb_sxm` | H200 | 4 | Large models with bigger KV cache | +| `8x_nvidia_h200_140gb_sxm` | H200 | 8 | Very large or long-context models | +| `1x_nvidia_b200_180gb_sxm` | B200 | 1 | Highest single-GPU performance | +| `8x_nvidia_b200_180gb_sxm` | B200 | 8 | Maximum throughput / largest models | + +## Hardware Availability Status + +| Status | Meaning | +|--------|---------| +| `available` | Ready for deployment | +| `unavailable` | Currently not available | +| `insufficient` | Some capacity but may be limited | + +## Hardware Response Object + +```json +{ + "object": "hardware", + "id": "1x_nvidia_h100_80gb_sxm", + "pricing": { "cents_per_minute": 10.82 }, + "specs": { + "gpu_type": "h100", + "gpu_link": "sxm", + "gpu_memory": 80, + "gpu_count": 1 + }, + "availability": { "status": "available" }, + "updated_at": "2025-01-15T14:30:00Z" +} +``` + +## Pricing Model + +- Billed per minute while endpoint is running (even when idle) +- No charge during spin-up or for failed deployments +- Stop endpoint to pause charges +- 
Price varies by hardware configuration (check `cents_per_minute`) + +### Single-GPU on-demand rates + +Reference prices for the currently-offered single-GPU SKUs (multiply by GPU count for multi-GPU +configurations of the same family; for the authoritative live rates always call the API or CLI): + +| Hardware ID | Cost/hour | +|-------------|-----------| +| `1x_nvidia_h100_80gb_sxm` | $6.49 | +| `1x_nvidia_h200_140gb_sxm` | $7.89 | +| `1x_nvidia_b200_180gb_sxm` | $11.95 | + +Multi-GPU hardware IDs share the single-GPU suffix, e.g. four H100s use `4x_nvidia_h100_80gb_sxm`. +Cost scales linearly with the GPU count. + +Each running replica bills independently and stops billing as soon as it is scaled down. Run +`together endpoints hardware --model <MODEL_NAME>` (or `tg endpoints hardware --model <MODEL_NAME>`) +for the per-model list with current per-minute rates. + +## GPU Selection Guide + +| Need | Recommendation | +|------|---------------| +| Small models (up to 9B) | 1x H100 | +| Medium models (7-20B) | 1-2x H100 | +| Large models (70B) | 4-8x H100 or 4x H200 | +| Very large / MoE models (120B+) | 8x H100, 8x H200, or 8x B200 | +| Maximum throughput | 8x B200 + multiple replicas | +| Cost-effective baseline | H100 (lowest per-hour rate of currently-offered SKUs) | +| Long-context / memory-bound | H200 or B200 (larger HBM) | +| Maximum performance | B200 (newest generation, highest single-GPU speed) | + +Fine-tuned and custom-uploaded models may require larger hardware than their base parameter count +suggests. For example, a fine-tuned 8B model may only be eligible for 4x or 8x H100 configs. 
+Always call `list_hardware(model=...)` to get the authoritative list of eligible hardware before +creating an endpoint: + +```python +response = client.endpoints.list_hardware(model="your-username/your-finetuned-model") +for hw in response.data: + status = hw.availability.status if hw.availability else "unknown" + print(f" {hw.id} ({status})") +``` + +## Scaling + +### Horizontal (Replicas) + +- Increases maximum QPS +- Linear cost scaling +- Best for high-concurrency workloads + +### Vertical (GPU Count) + +- Increases generation speed +- Reduces time-to-first-token +- Best for latency-sensitive workloads + +## Autoscaling Schema + +```json +{ + "min_replicas": 1, + "max_replicas": 5 +} +``` + +- `min_replicas`: Always running (even with no traffic) +- `max_replicas`: Maximum under load diff --git a/plugins/togetherai/skills/together-dedicated-endpoints/scripts/deploy_finetuned.py b/plugins/togetherai/skills/together-dedicated-endpoints/scripts/deploy_finetuned.py new file mode 100644 index 00000000..e843d4e2 --- /dev/null +++ b/plugins/togetherai/skills/together-dedicated-endpoints/scripts/deploy_finetuned.py @@ -0,0 +1,196 @@ +#!/usr/bin/env python3 +""" +Together AI -- Deploy a Fine-tuned Model on a Dedicated Endpoint (v2 SDK) + +Deploy a fine-tuned model as a dedicated endpoint, wait for it to become +ready, and optionally run inference or tear down the endpoint. + +Fine-tuned models may require larger hardware than the base parameter count +suggests (e.g. 4x H100 for an 8B model). The script validates the chosen +hardware against eligible configs before creating the endpoint. 
def list_finetuning_jobs() -> list:
    """Print one summary line per fine-tuning job and return the job records.

    Each line shows the job id, its status, and the model name to deploy.
    The name is taken from ``fine_tuned_model`` when set, otherwise from
    ``output_name``; ``"pending"`` is shown when neither attribute is set.

    Returns:
        The ``data`` list from ``client.fine_tuning.list()``.
    """
    records = client.fine_tuning.list().data
    for record in records:
        # First truthy attribute wins; fall back to the placeholder.
        output = "pending"
        for attr in ("fine_tuned_model", "output_name"):
            candidate = getattr(record, attr, None)
            if candidate:
                output = candidate
                break
        print(f" {record.id}: {record.status} model={output}")
    return records
def wait_for_ready(endpoint_id: str, timeout: int = 600, poll_interval: int = 15):
    """Poll an endpoint until it reports the STARTED state.

    Args:
        endpoint_id: ID of the endpoint to poll.
        timeout: Maximum wall-clock seconds to wait.
        poll_interval: Seconds to sleep between polls.

    Returns:
        The endpoint object from the poll that observed ``STARTED``.

    Raises:
        RuntimeError: The endpoint reported the ``ERROR`` state.
        TimeoutError: ``STARTED`` was not observed within ``timeout`` seconds.
    """
    # Track real elapsed time. Summing the sleep intervals would ignore the
    # time spent inside each retrieve() call and let the wait overrun.
    start = time.monotonic()
    deadline = start + timeout
    while True:
        endpoint = client.endpoints.retrieve(endpoint_id)
        elapsed = int(time.monotonic() - start)
        print(f" State: {endpoint.state} ({elapsed}s)")

        if endpoint.state == "STARTED":
            return endpoint
        if endpoint.state == "ERROR":
            raise RuntimeError(f"Endpoint entered ERROR state: {endpoint_id}")

        remaining = deadline - time.monotonic()
        if remaining <= 0:
            break
        # Never sleep past the deadline; the loop then polls one last time.
        time.sleep(min(poll_interval, remaining))

    raise TimeoutError(f"Endpoint not ready after {timeout}s")
def stop_endpoint(endpoint_id: str) -> None:
    """Request the STOPPED state for an endpoint and print a confirmation.

    The endpoint is updated, not deleted.

    Args:
        endpoint_id: ID of the endpoint to stop.
    """
    client.endpoints.update(endpoint_id, state="STOPPED")
    confirmation = f"Stopped endpoint: {endpoint_id}"
    print(confirmation)
def main() -> None:
    """Run the selected sub-command.

    ``list-jobs`` prints recent fine-tuning jobs and returns. ``deploy``
    optionally validates the hardware, creates the endpoint, waits for it to
    start, sends one test prompt, and then applies the teardown policy:
    ``--leave-running`` keeps it, ``--delete`` removes it, and the default
    stops it.

    The teardown policy is applied even when waiting or inference raises, so
    a failed run does not leave a created endpoint running unintentionally.
    """
    args = parse_args()
    if args.command == "list-jobs":
        list_finetuning_jobs()
        return

    if not args.skip_hardware_check:
        validate_hardware(args.model_name, args.hardware)

    endpoint = deploy_finetuned(
        model_name=args.model_name,
        hardware=args.hardware,
        display_name=args.display_name,
        min_replicas=args.min_replicas,
        max_replicas=args.max_replicas,
    )
    # Keep the id separately: `endpoint` is rebound below and may never be
    # reassigned if wait_for_ready raises.
    endpoint_id = endpoint.id
    try:
        endpoint = wait_for_ready(endpoint_id, timeout=args.timeout, poll_interval=args.poll_interval)
        run_inference(endpoint.name, args.prompt)
    finally:
        # Runs on success and on failure (timeout, ERROR state, inference error).
        if args.leave_running:
            print(f"Endpoint left running: {endpoint_id}")
        elif args.delete:
            delete_endpoint(endpoint_id)
        else:
            stop_endpoint(endpoint_id)
def wait_for_ready(endpoint_id: str, timeout: int = 600, poll_interval: int = 10):
    """Poll an endpoint until it reports the STARTED state.

    Args:
        endpoint_id: ID of the endpoint to poll.
        timeout: Maximum wall-clock seconds to wait.
        poll_interval: Seconds to sleep between polls.

    Returns:
        The endpoint object from the poll that observed ``STARTED``.

    Raises:
        RuntimeError: The endpoint reported the ``ERROR`` state.
        TimeoutError: ``STARTED`` was not observed within ``timeout`` seconds.
    """
    # Track real elapsed time. Summing the sleep intervals would ignore the
    # time spent inside each retrieve() call and let the wait overrun.
    start = time.monotonic()
    deadline = start + timeout
    while True:
        endpoint = client.endpoints.retrieve(endpoint_id)
        elapsed = int(time.monotonic() - start)
        print(f" State: {endpoint.state} ({elapsed}s)")

        if endpoint.state == "STARTED":
            return endpoint
        if endpoint.state == "ERROR":
            raise RuntimeError(f"Endpoint entered ERROR state: {endpoint_id}")

        remaining = deadline - time.monotonic()
        if remaining <= 0:
            break
        # Never sleep past the deadline; the loop then polls one last time.
        time.sleep(min(poll_interval, remaining))

    raise TimeoutError(f"Endpoint not ready after {timeout}s")
def main() -> None:
    """Dispatch the selected sub-command.

    The single-purpose commands (``list-hardware``, ``list-endpoints``,
    ``create``, ``wait``, ``infer``, ``stop``, ``delete``) each call the
    matching helper and return. ``demo`` runs the whole lifecycle: list
    hardware, list endpoints, create an endpoint, wait for it, send one
    prompt, then stop it (or delete it with ``--delete``).

    In the demo flow the stop/delete step runs even when waiting or inference
    raises, so a failed run does not leave the new endpoint running.
    """
    args = parse_args()
    if args.command == "list-hardware":
        list_hardware(model=args.model)
        return
    if args.command == "list-endpoints":
        list_endpoints()
        return
    if args.command == "create":
        create_endpoint(
            model=args.model,
            hardware=args.hardware,
            min_replicas=args.min_replicas,
            max_replicas=args.max_replicas,
            display_name=args.display_name,
            inactive_timeout=args.inactive_timeout,
        )
        return
    if args.command == "wait":
        wait_for_ready(args.endpoint_id, timeout=args.timeout, poll_interval=args.poll_interval)
        return
    if args.command == "infer":
        run_inference(args.model_name, args.prompt)
        return
    if args.command == "stop":
        stop_endpoint(args.endpoint_id)
        return
    if args.command == "delete":
        delete_endpoint(args.endpoint_id)
        return

    # Only "demo" remains: the sub-command is required and all others returned above.
    print("Available hardware:")
    list_hardware(model=args.model)
    print("\nYour endpoints:")
    list_endpoints()
    endpoint = create_endpoint(
        model=args.model,
        hardware=args.hardware,
        display_name=args.display_name,
    )
    # Keep the id separately: `endpoint` is rebound below and may never be
    # reassigned if wait_for_ready raises.
    endpoint_id = endpoint.id
    try:
        endpoint = wait_for_ready(endpoint_id, timeout=args.timeout, poll_interval=args.poll_interval)
        run_inference(endpoint.name, args.prompt)
    finally:
        # Runs on success and on failure (timeout, ERROR state, inference error).
        if args.delete:
            delete_endpoint(endpoint_id)
        else:
            stop_endpoint(endpoint_id)
/**
 * Poll an endpoint until it reports the STARTED state.
 *
 * @param endpointId ID of the endpoint to poll.
 * @param timeoutMs  Maximum wall-clock time to wait, in milliseconds.
 * @param pollMs     Delay between polls, in milliseconds.
 * @returns The endpoint object from the poll that observed STARTED.
 * @throws Error when the endpoint reports ERROR, or when STARTED is not
 *         observed within `timeoutMs`.
 */
async function waitForReady(
  endpointId: string,
  timeoutMs: number = 600_000,
  pollMs: number = 10_000,
): Promise<Awaited<ReturnType<typeof client.endpoints.retrieve>>> {
  const start = Date.now();
  const deadline = start + timeoutMs;
  while (true) {
    const endpoint = await client.endpoints.retrieve(endpointId);
    const elapsed = Math.round((Date.now() - start) / 1000);
    console.log(` State: ${endpoint.state} (${elapsed}s)`);

    if (endpoint.state === "STARTED") return endpoint;
    if (endpoint.state === "ERROR") {
      throw new Error(`Endpoint entered ERROR state: ${endpointId}`);
    }

    const remaining = deadline - Date.now();
    if (remaining <= 0) break;
    // Never sleep past the deadline; the loop then polls one last time.
    await new Promise<void>((resolve) =>
      setTimeout(resolve, Math.min(pollMs, remaining)),
    );
  }
  throw new Error(`Endpoint not ready after ${timeoutMs / 1000}s`);
}
/**
 * Run the full example lifecycle: list hardware, list endpoints, create an
 * endpoint, wait for it to start, send one prompt, then stop the endpoint.
 *
 * The stop step runs even when waiting or inference throws, so a failed run
 * does not leave the newly created endpoint running.
 */
async function main(): Promise<void> {
  const MODEL = "Qwen/Qwen3.5-9B-FP8";
  const HARDWARE = "1x_nvidia_h100_80gb_sxm";

  // 1. List available hardware
  await listHardware(MODEL);

  // 2. List existing endpoints
  await listEndpoints();

  // 3. Create endpoint
  const ep = await createEndpoint(MODEL, HARDWARE, 1, 1, "My Qwen Endpoint");

  try {
    // 4. Wait until ready
    const ready = await waitForReady(ep.id);

    // 5. Run inference
    await runInference(ready.name, "What is the capital of France?");
  } finally {
    // 6. Stop endpoint (also on timeout, ERROR state, or inference failure)
    await stopEndpoint(ep.id);
  }

  // 7. Delete endpoint (uncomment to permanently remove)
  // await deleteEndpoint(ep.id);
}
+ +Usage: + python upload_custom_model.py --model-name my-custom-model --hf-repo your-org/your-model --hardware 2x_nvidia_h100_80gb_sxm + python upload_custom_model.py --model-name my-custom-model --s3-url https://signed-url --skip-deploy + +Requires: + uv pip install "together>=2.0.0" + export TOGETHER_API_KEY=your_key +""" + +import argparse +import time + +from together import Together + +client = Together() + + +def upload_from_huggingface(model_name: str, hf_repo: str, hf_token: str | None = None) -> str: + """Upload a model from Hugging Face Hub and return the upload job id.""" + kwargs: dict = {"model_name": model_name, "model_source": hf_repo} + if hf_token: + kwargs["hf_token"] = hf_token + response = client.models.upload(**kwargs) + job_id = response.data.job_id + print(f"Upload started: job_id={job_id}") + return job_id + + +def upload_from_s3(model_name: str, presigned_url: str) -> str: + """Upload a model from an S3 presigned URL and return the upload job id.""" + response = client.models.upload(model_name=model_name, model_source=presigned_url) + job_id = response.data.job_id + print(f"Upload started: job_id={job_id}") + return job_id + + +def check_upload_status(job_id: str) -> str: + """Check upload job status via the v2 SDK.""" + response = client.models.uploads.status(job_id) + return response.status + + +def wait_for_upload(job_id: str, timeout: int = 3600, poll_interval: int = 30) -> None: + """Poll until the upload job completes.""" + elapsed = 0 + while elapsed < timeout: + status = check_upload_status(job_id) + print(f" Upload status: {status} ({elapsed}s)") + + if status == "Complete": + print("Upload complete.") + return + if status in ("Failed", "Cancelled"): + raise RuntimeError(f"Upload job {status}: {job_id}") + + time.sleep(poll_interval) + elapsed += poll_interval + + raise TimeoutError(f"Upload not complete after {timeout}s") + + +def list_hardware(model_name: str) -> list: + """List hardware available for a custom model.""" + 
response = client.endpoints.list_hardware(model=model_name) + for hw in response.data: + status = hw.availability.status if hw.availability else "unknown" + print(f" {hw.id} ({status})") + return response.data + + +def deploy_model(model_name: str, hardware: str, display_name: str | None = None): + """Deploy the uploaded model on a dedicated endpoint.""" + endpoint = client.endpoints.create( + model=model_name, + hardware=hardware, + autoscaling={"min_replicas": 1, "max_replicas": 1}, + display_name=display_name, + ) + print(f"Endpoint created: {endpoint.id} (state: {endpoint.state})") + print(f" Endpoint name (for inference): {endpoint.name}") + return endpoint + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Upload and optionally deploy a custom Together AI model") + parser.add_argument("--model-name", required=True, help="Target Together AI model name") + source_group = parser.add_mutually_exclusive_group(required=True) + source_group.add_argument("--hf-repo", help="Hugging Face repo id") + source_group.add_argument("--s3-url", help="Presigned S3 URL for a model archive") + parser.add_argument("--hf-token", help="Hugging Face token for private repos") + parser.add_argument("--hardware", help="Hardware id for optional deployment") + parser.add_argument("--display-name", help="Optional endpoint display name") + parser.add_argument("--show-hardware", action="store_true", help="List hardware after upload") + parser.add_argument("--skip-deploy", action="store_true", help="Upload the model without deploying it") + parser.add_argument("--timeout", type=int, default=3600, help="Maximum wait time in seconds") + parser.add_argument("--poll-interval", type=int, default=30, help="Seconds between status checks") + return parser.parse_args() + + +def main() -> None: + args = parse_args() + if args.hf_repo: + job_id = upload_from_huggingface(args.model_name, args.hf_repo, hf_token=args.hf_token) + else: + job_id = 
upload_from_s3(args.model_name, args.s3_url) + + wait_for_upload(job_id, timeout=args.timeout, poll_interval=args.poll_interval) + + if args.show_hardware or not args.skip_deploy: + print("\nAvailable hardware:") + list_hardware(args.model_name) + + if args.skip_deploy: + return + + if not args.hardware: + raise SystemExit("--hardware is required unless --skip-deploy is set") + + endpoint = deploy_model(args.model_name, args.hardware, display_name=args.display_name) + print(f"\nEndpoint ID: {endpoint.id}") + print(f"Use for inference: model='{endpoint.name}'") + + +if __name__ == "__main__": + main() diff --git a/plugins/togetherai/skills/together-embeddings/SKILL.md b/plugins/togetherai/skills/together-embeddings/SKILL.md new file mode 100644 index 00000000..21003ba4 --- /dev/null +++ b/plugins/togetherai/skills/together-embeddings/SKILL.md @@ -0,0 +1,75 @@ +--- +name: together-embeddings +description: "Dense vector embeddings, semantic search, RAG pipelines, and reranking via Together AI. Generate embeddings with open-source models and rerank results behind dedicated endpoints. Reach for it whenever the user needs vector representations or retrieval quality improvements rather than direct text generation." +--- + +# Together Embeddings & Reranking + +## Overview + +Use this skill for semantic retrieval components: + +- create embeddings +- batch embeddings +- build retrieval or RAG pipelines +- rerank retrieved candidates + +This skill is for retrieval plumbing, not for the final language-model response itself. 
+ +## When This Skill Wins + +- Build vector search or semantic similarity features +- Add embedding generation to a data pipeline +- Improve retrieval quality with reranking +- Assemble a retrieval stage before calling a chat model + +## Hand Off To Another Skill + +- Use `together-chat-completions` for the final answer-generation step +- Use `together-batch-inference` for very large offline embedding backfills +- Use `together-dedicated-endpoints` when reranking requires a dedicated deployment + +## Quick Routing + +- Embeddings API usage + - Read [references/api-reference.md](references/api-reference.md) + - Start with [scripts/embed_and_rerank.py](scripts/embed_and_rerank.py) or [scripts/embed_and_rerank.ts](scripts/embed_and_rerank.ts) +- Semantic search (embed, store, query) + - Start with [scripts/semantic_search.py](scripts/semantic_search.py) -- includes an in-memory vector store, cosine-similarity retrieval, and optional rerank +- RAG pipeline composition + - Start with [scripts/rag_pipeline.py](scripts/rag_pipeline.py) +- Model selection and rerank constraints + - Read [references/models.md](references/models.md) + +## Workflow + +1. Confirm that the user needs vectors or retrieval, not direct generation. +2. Choose the embedding model and batch shape. +3. Generate embeddings for corpus and query paths consistently. +4. Retrieve candidates. An in-memory cosine-similarity store works for prototyping and small corpora (see `semantic_search.py`). Use a dedicated vector database for production scale. +5. Rerank only when the extra latency and endpoint requirement are justified. When no dedicated rerank endpoint is available, cosine-similarity ranking is a reasonable fallback. + +## High-Signal Rules + +- Python scripts require the Together v2 SDK (`together>=2.0.0`). If the user is on an older version, they must upgrade first: `uv pip install --upgrade "together>=2.0.0"`. 
+- Keep embeddings and reranking conceptually separate; rerank is a second-stage precision step. +- Reranking in this repo assumes a dedicated endpoint. Do not promise serverless rerank unless the product changes. When no endpoint is available, fall back to cosine-similarity ranking. +- The embedding model has a 514-token context limit. Chunk longer documents before embedding. +- The `rag_pipeline.py` example demonstrates retrieval plus generation; treat generation as a hand-off to chat completions. +- Preserve model consistency across indexing and querying. + +## Resource Map + +- API details: [references/api-reference.md](references/api-reference.md) +- Model guide: [references/models.md](references/models.md) +- Python embeddings example: [scripts/embed_and_rerank.py](scripts/embed_and_rerank.py) +- TypeScript embeddings example: [scripts/embed_and_rerank.ts](scripts/embed_and_rerank.ts) +- Python semantic search: [scripts/semantic_search.py](scripts/semantic_search.py) +- Python RAG pipeline: [scripts/rag_pipeline.py](scripts/rag_pipeline.py) + +## Official Docs + +- [Embeddings Overview](https://docs.together.ai/docs/embeddings-overview) +- [Rerank Overview](https://docs.together.ai/docs/rerank-overview) +- [Embeddings API](https://docs.together.ai/reference/embeddings) +- [Rerank API](https://docs.together.ai/reference/rerank) diff --git a/plugins/togetherai/skills/together-embeddings/references/api-reference.md b/plugins/togetherai/skills/together-embeddings/references/api-reference.md new file mode 100644 index 00000000..78f7041b --- /dev/null +++ b/plugins/togetherai/skills/together-embeddings/references/api-reference.md @@ -0,0 +1,174 @@ +# Embeddings & Rerank API Reference +## Contents + +- [Endpoints](#endpoints) +- [Create Embeddings](#create-embeddings) +- [Rerank Documents](#rerank-documents) +- [HTTP Status Codes](#http-status-codes) + + +Base URL: `https://api.together.xyz/v1` + +## Endpoints + +| Method | Path | Description | 
+|--------|------|-------------| +| `POST` | `/embeddings` | Generate embeddings: convert text to vector representations | +| `POST` | `/rerank` | Rerank documents: reorder documents by relevance to a query (dedicated endpoint required) | + +## Create Embeddings + +### Single Input + +```python +from together import Together +client = Together() + +response = client.embeddings.create( + model="intfloat/multilingual-e5-large-instruct", + input="Our solar system orbits the Milky Way galaxy at about 515,000 mph", +) +print(response.data[0].embedding[:5]) +``` + +```typescript +import Together from "together-ai"; +const client = new Together(); + +const response = await client.embeddings.create({ + model: "intfloat/multilingual-e5-large-instruct", + input: "Our solar system orbits the Milky Way galaxy at about 515,000 mph", +}); +console.log(response.data[0].embedding.slice(0, 5)); +``` + +```shell +curl -X POST "https://api.together.xyz/v1/embeddings" \ + -H "Authorization: Bearer $TOGETHER_API_KEY" \ + -H "Content-Type: application/json" \ + -d '{ + "model": "intfloat/multilingual-e5-large-instruct", + "input": "Our solar system orbits the Milky Way galaxy at about 515,000 mph" + }' +``` + +### Batch Input + +Pass a list of strings to embed multiple texts in a single request. For large +corpora, batch in groups (e.g. 100 texts per call) to avoid timeouts and stay +within rate limits.
+ +```python +response = client.embeddings.create( + model="intfloat/multilingual-e5-large-instruct", + input=["First document", "Second document", "Third document"], +) +for item in response.data: + print(f"Index {item.index}: {len(item.embedding)} dimensions") +``` + +```typescript +const response = await client.embeddings.create({ + model: "intfloat/multilingual-e5-large-instruct", + input: ["First document", "Second document", "Third document"], +}); +for (const item of response.data) { + console.log(`Index ${item.index}: ${item.embedding.length} dimensions`); +} +``` + +Batching tip: For corpora larger than ~100 documents, split into batches: + +```python +batch_size = 100 +for start in range(0, len(texts), batch_size): + batch = texts[start : start + batch_size] + response = client.embeddings.create( + model="intfloat/multilingual-e5-large-instruct", + input=batch, + ) + # store response.data embeddings +``` + +### Request Parameters + +| Parameter | Type | Required | Description | +|-----------|------|----------|-------------| +| `model` | string | Yes | Embedding model identifier | +| `input` | string or string[] | Yes | Text(s) to embed | + +### Supported Models + +- `intfloat/multilingual-e5-large-instruct` + +### Response Schema + +```json +{ + "object": "list", + "model": "intfloat/multilingual-e5-large-instruct", + "data": [ + { + "object": "embedding", + "embedding": [0.0023, -0.0142, 0.0381, ...], + "index": 0 + } + ] +} +``` + +## Rerank Documents + +Reranking requires a dedicated endpoint. See the +[Rerank Overview](https://docs.together.ai/docs/rerank-overview) for current models and +setup instructions. 
+ +### Request Parameters + +| Parameter | Type | Required | Description | +|-----------|------|----------|-------------| +| `model` | string | Yes | Rerank model identifier | +| `query` | string | Yes | Search query | +| `documents` | string[] or object[] | Yes | Documents to rerank | +| `top_n` | integer | No | Return only top N results | +| `return_documents` | boolean | No | Include document text in response | +| `rank_fields` | string[] | No | Fields to rank by for JSON objects | + +### Response Schema + +```json +{ + "object": "rerank", + "id": "rerank-abc123", + "model": "", + "results": [ + { + "index": 0, + "relevance_score": 0.9823, + "document": {"text": "..."} + }, + { + "index": 2, + "relevance_score": 0.8451 + } + ], + "usage": { + "prompt_tokens": 150, + "total_tokens": 150 + } +} +``` + +The `document` field is only present when `return_documents=true`. + +## HTTP Status Codes + +| Code | Description | +|------|-------------| +| 200 | Success | +| 400 | Bad request (invalid parameters) | +| 401 | Unauthorized (invalid API key) | +| 404 | Not found (invalid model) | +| 429 | Rate limit exceeded | +| 503 | Service overloaded | +| 504 | Request timeout | diff --git a/plugins/togetherai/skills/together-embeddings/references/models.md b/plugins/togetherai/skills/together-embeddings/references/models.md new file mode 100644 index 00000000..eae77cd1 --- /dev/null +++ b/plugins/togetherai/skills/together-embeddings/references/models.md @@ -0,0 +1,93 @@ +# Embedding & Rerank Models Reference + +## Embedding Models + +| Model | API String | Size | Dimensions | Context | Best For | +|-------|-----------|------|-----------|---------|----------| +| Multilingual E5 Large | `intfloat/multilingual-e5-large-instruct` | 560M | 1,024 | 514 tokens | Multilingual retrieval (recommended) | + +## Rerank Models + +Reranking is currently available exclusively via dedicated endpoints. Deploy a rerank model +as a dedicated endpoint, then use the `/v1/rerank` API. 
+ +See the [Rerank Overview](https://docs.together.ai/docs/rerank-overview) for available models +and setup instructions. + +## Embeddings API Parameters + +| Parameter | Type | Required | Description | +|-----------|------|----------|-------------| +| `model` | string | Yes | Embedding model identifier | +| `input` | string or string[] | Yes | Text(s) to embed | + +## Embeddings Response + +```json +{ + "object": "list", + "model": "intfloat/multilingual-e5-large-instruct", + "data": [ + { + "object": "embedding", + "embedding": [0.0023, -0.0142, ...], + "index": 0 + } + ] +} +``` + +## Rerank Parameters + +| Parameter | Type | Required | Description | +|-----------|------|----------|-------------| +| `model` | string | Yes | Rerank model identifier | +| `query` | string | Yes | Search query | +| `documents` | string[] or object[] | Yes | Documents to rerank. Pass objects with named fields for structured documents. | +| `top_n` | int | No | Return only top N results | +| `return_documents` | bool | No | Include document text in response | +| `rank_fields` | string[] | No | Fields to use for ranking when documents are JSON objects | + +## Rerank Response + +```json +{ + "object": "rerank", + "id": "rerank-abc123", + "model": "", + "results": [ + {"index": 0, "relevance_score": 0.9823}, + {"index": 3, "relevance_score": 0.8451}, + {"index": 1, "relevance_score": 0.2134} + ], + "usage": { + "prompt_tokens": 150, + "total_tokens": 150 + } +} +``` + +## Choosing a Model + +### Embeddings + +The active serverless embedding model is `intfloat/multilingual-e5-large-instruct` (1024 +dimensions, 514 token max input). It supports multilingual text and is recommended for all +embedding use cases including retrieval, semantic similarity, and classification. + +### Practical Notes + +- 514-token context limit: Input text beyond 514 tokens is truncated silently. For + longer documents (articles, product pages, support tickets), split into chunks before + embedding. 
A typical English sentence is ~20 tokens, so 514 tokens covers roughly 25 + sentences (a few short paragraphs). +- Use the same model for indexing and querying. Mixing embedding models between corpus + and query will produce meaningless similarity scores. +- Cosine similarity works out of the box. E5 embeddings are normalized, so cosine + similarity and dot product give equivalent rankings. + +### Reranking + +There are currently no serverless rerank models. Reranking requires deploying a model on a +dedicated endpoint. See the [Rerank Overview](https://docs.together.ai/docs/rerank-overview) +for available models and instructions. diff --git a/plugins/togetherai/skills/together-embeddings/scripts/embed_and_rerank.py b/plugins/togetherai/skills/together-embeddings/scripts/embed_and_rerank.py new file mode 100644 index 00000000..e1daebf5 --- /dev/null +++ b/plugins/togetherai/skills/together-embeddings/scripts/embed_and_rerank.py @@ -0,0 +1,163 @@ +#!/usr/bin/env python3 +""" +Together AI Embeddings Pipeline (v2 SDK) + +Embed documents, compute similarity, and optionally rerank results. + +Reranking requires a dedicated endpoint. When no endpoint is configured the +rerank helper falls back to cosine-similarity order. See +https://docs.together.ai/docs/rerank-overview for setup instructions. + +Usage: + python embed_and_rerank.py + +Requires: + uv pip install "together>=2.0.0" + export TOGETHER_API_KEY=your_key +""" + +import math +from together import Together + +client = Together() + +# Set to your dedicated rerank endpoint model name to enable API reranking.
+# See https://docs.together.ai/docs/rerank-overview +RERANK_MODEL: str | None = None + + +def embed_texts( + texts: list[str], + model: str = "intfloat/multilingual-e5-large-instruct", +) -> list[list[float]]: + """Embed a list of texts, returns list of embedding vectors.""" + response = client.embeddings.create( + model=model, + input=texts, + ) + return [item.embedding for item in response.data] + + +def cosine_similarity(a: list[float], b: list[float]) -> float: + """Compute cosine similarity between two vectors.""" + dot = sum(x * y for x, y in zip(a, b)) + norm_a = math.sqrt(sum(x * x for x in a)) + norm_b = math.sqrt(sum(x * x for x in b)) + return dot / (norm_a * norm_b) if norm_a and norm_b else 0.0 + + +# --- Rerank with fallback --- + +def rerank_documents( + query: str, + documents: list[str], + scores: list[float] | None = None, + top_n: int = 3, +) -> list[dict]: + """Rerank documents by relevance to a query. + + When RERANK_MODEL is set, calls the dedicated rerank endpoint. + Otherwise falls back to the cosine-similarity scores passed in. + """ + if RERANK_MODEL is not None: + response = client.rerank.create( + model=RERANK_MODEL, + query=query, + documents=documents, + top_n=top_n, + ) + return [ + { + "index": item.index, + "score": item.relevance_score, + "document": documents[item.index], + } + for item in response.results + ] + + # Fallback: rank by pre-computed cosine-similarity scores + if scores is None: + query_emb = embed_texts([query])[0] + doc_embs = embed_texts(documents) + scores = [cosine_similarity(query_emb, d) for d in doc_embs] + + ranked = sorted( + [{"index": i, "score": s, "document": d} + for i, (d, s) in enumerate(zip(documents, scores))], + key=lambda x: x["score"], + reverse=True, + ) + return ranked[:top_n] + + +def rerank_structured( + query: str, + documents: list[dict], + rank_fields: list[str], + top_n: int | None = None, +) -> list[dict]: + """Rerank structured JSON documents by specific fields. 
+ + Requires a dedicated rerank endpoint (RERANK_MODEL must be set). + """ + if RERANK_MODEL is None: + raise RuntimeError( + "Structured reranking requires a dedicated endpoint. " + "Set RERANK_MODEL to your endpoint model name." + ) + kwargs: dict = { + "model": RERANK_MODEL, + "query": query, + "documents": documents, + "rank_fields": rank_fields, + "return_documents": True, + } + if top_n: + kwargs["top_n"] = top_n + + response = client.rerank.create(**kwargs) + return [ + { + "index": item.index, + "score": item.relevance_score, + "document": documents[item.index], + } + for item in response.results + ] + + +if __name__ == "__main__": + # --- Example 1: Embed and compute similarity --- + print("=== Embedding Similarity ===") + texts = [ + "Python is a popular programming language", + "JavaScript is used for web development", + "Machine learning uses statistical models", + ] + query = "What language is good for data science?" + + embeddings = embed_texts(texts + [query]) + query_emb = embeddings[-1] + doc_embs = embeddings[:-1] + + scores = [] + for i, text in enumerate(texts): + sim = cosine_similarity(query_emb, doc_embs[i]) + scores.append(sim) + print(f" {sim:.4f} -- {text}") + + # --- Example 2: Rerank (dedicated endpoint or cosine-similarity fallback) --- + print(f"\n=== Reranking ===") + if RERANK_MODEL is None: + print(" (no dedicated rerank endpoint -- using cosine-similarity fallback)") + + documents = [ + "Python is widely used in data science and machine learning.", + "Java is a popular language for enterprise applications.", + "R is a language designed for statistical computing.", + "JavaScript powers most web applications.", + "SQL is essential for database querying.", + ] + ranked = rerank_documents(query, documents, top_n=3) + for r in ranked: + print(f" [{r['score']:.4f}] {r['document']}") diff --git a/plugins/togetherai/skills/together-embeddings/scripts/embed_and_rerank.ts 
b/plugins/togetherai/skills/together-embeddings/scripts/embed_and_rerank.ts new file mode 100644 index 00000000..1f01bf7e --- /dev/null +++ b/plugins/togetherai/skills/together-embeddings/scripts/embed_and_rerank.ts @@ -0,0 +1,68 @@ +#!/usr/bin/env -S npx tsx +/** + * Together AI Embeddings Pipeline + * + * Embed documents and compute similarity. + * + * Note: Reranking requires a dedicated endpoint, so this file does not + * include a rerank step. See https://docs.together.ai/docs/rerank-overview + * for setup instructions. + * + * Usage: + * npx tsx embed_and_rerank.ts + * + * Requires: + * npm install together-ai + * export TOGETHER_API_KEY=your_key + */ + +import Together from "together-ai"; + +const client = new Together({ + apiKey: process.env.TOGETHER_API_KEY, +}); + +function cosineSimilarity(a: number[], b: number[]): number { + let dot = 0, normA = 0, normB = 0; + for (let i = 0; i < a.length; i++) { + dot += a[i] * b[i]; + normA += a[i] * a[i]; + normB += b[i] * b[i]; + } + return dot / (Math.sqrt(normA) * Math.sqrt(normB)); +} + +async function embedTexts(texts: string[]): Promise { + const response = await client.embeddings.create({ + model: "intfloat/multilingual-e5-large-instruct", + input: texts, + }); + return response.data.map((item) => item.embedding); +} + +async function embeddingSimilarity(): Promise { + console.log("=== Embedding Similarity ==="); + const texts = [ + "Python is a popular programming language", + "JavaScript is used for web development", + "Machine learning uses statistical models", + ]; + const query = "What language is good for data science?"; + + const embeddings = await embedTexts([...texts, query]); + const queryEmb = embeddings[embeddings.length - 1]; + + for (let i = 0; i < texts.length; i++) { + const sim = cosineSimilarity(queryEmb, embeddings[i]); + console.log(` ${sim.toFixed(4)} -- ${texts[i]}`); + } +} + +// Note: Reranking requires a dedicated endpoint.
+// See https://docs.together.ai/docs/rerank-overview for setup instructions. + +async function main(): Promise { + await embeddingSimilarity(); +} + +main(); diff --git a/plugins/togetherai/skills/together-embeddings/scripts/rag_pipeline.py b/plugins/togetherai/skills/together-embeddings/scripts/rag_pipeline.py new file mode 100644 index 00000000..b32b3ddb --- /dev/null +++ b/plugins/togetherai/skills/together-embeddings/scripts/rag_pipeline.py @@ -0,0 +1,152 @@ +#!/usr/bin/env python3 +""" +Together AI RAG Pipeline -- Embed, Retrieve, Generate (v2 SDK) + +Demonstrates a complete Retrieval-Augmented Generation pipeline using +Together AI embeddings and chat completions. + +Uses an in-memory vector store for simplicity. Replace with your preferred +vector database (Pinecone, Weaviate, Chroma, etc.) for production use. + +Note: Reranking requires a dedicated endpoint and is not included in this +pipeline. See https://docs.together.ai/docs/rerank-overview for details. + +Usage: + python rag_pipeline.py + +Requires: + uv pip install "together>=2.0.0" + export TOGETHER_API_KEY=your_key +""" + +import math +from together import Together + +client = Together() + +EMBEDDING_MODEL = "intfloat/multilingual-e5-large-instruct" +# Reranking requires a dedicated endpoint. 
See: +# https://docs.together.ai/docs/rerank-overview +CHAT_MODEL = "openai/gpt-oss-20b" + + +# --- Simple in-memory vector store --- + +class Document: + """A document with text and its embedding vector.""" + + def __init__(self, text: str, embedding: list[float] | None = None): + self.text = text + self.embedding = embedding + + +class VectorStore: + """Minimal in-memory vector store using cosine similarity.""" + + def __init__(self): + self.documents: list[Document] = [] + + def add(self, texts: list[str]) -> None: + """Embed and store a list of texts.""" + response = client.embeddings.create( + model=EMBEDDING_MODEL, + input=texts, + ) + for i, item in enumerate(response.data): + self.documents.append(Document(texts[i], item.embedding)) + print(f"Indexed {len(texts)} documents ({len(self.documents)} total)") + + def search(self, query_embedding: list[float], top_k: int = 10) -> list[Document]: + """Return top_k most similar documents by cosine similarity.""" + scored = [] + for doc in self.documents: + sim = self._cosine_similarity(query_embedding, doc.embedding) + scored.append((sim, doc)) + scored.sort(key=lambda x: x[0], reverse=True) + return [doc for _, doc in scored[:top_k]] + + @staticmethod + def _cosine_similarity(a: list[float], b: list[float]) -> float: + dot = sum(x * y for x, y in zip(a, b)) + norm_a = math.sqrt(sum(x * x for x in a)) + norm_b = math.sqrt(sum(x * x for x in b)) + return dot / (norm_a * norm_b) if norm_a and norm_b else 0.0 + + +# --- RAG Pipeline --- + +def rag_query(store: VectorStore, query: str, top_k: int = 5) -> str: + """Run the full RAG pipeline: embed -> retrieve -> generate.""" + + # 1. Embed the query + query_embedding = client.embeddings.create( + model=EMBEDDING_MODEL, + input=query, + ).data[0].embedding + + # 2. Retrieve candidates from vector store + candidates = store.search(query_embedding, top_k=top_k) + print(f"Retrieved {len(candidates)} candidates") + + # 3. 
Generate answer using top documents as context + context = "\n\n".join([c.text for c in candidates]) + response = client.chat.completions.create( + model=CHAT_MODEL, + messages=[ + { + "role": "system", + "content": ( + "Answer the user's question based on the following context. " + "If the context doesn't contain enough information, say so.\n\n" + f"Context:\n{context}" + ), + }, + {"role": "user", "content": query}, + ], + max_tokens=500, + ) + return response.choices[0].message.content + + +if __name__ == "__main__": + # Sample knowledge base + knowledge = [ + "Photosynthesis is the process by which green plants convert sunlight into " + "chemical energy. It takes place primarily in the leaves using chlorophyll.", + "The mitochondria is the powerhouse of the cell, responsible for producing " + "ATP through cellular respiration.", + "DNA replication is the process by which a cell copies its DNA before cell " + "division. It involves enzymes like helicase and DNA polymerase.", + "The water cycle describes how water evaporates from surfaces, rises into " + "the atmosphere, cools and condenses into clouds, and falls as precipitation.", + "Plate tectonics is a theory explaining the movement of Earth's lithospheric " + "plates. It accounts for earthquakes, volcanic activity, and mountain building.", + "The human immune system has two main components: innate immunity (immediate, " + "non-specific) and adaptive immunity (delayed, specific to pathogens).", + "Gravity is a fundamental force of attraction between objects with mass. " + "Einstein's general relativity describes it as the curvature of spacetime.", + "Evolution by natural selection is the process where organisms with favorable " + "traits are more likely to survive and reproduce.", + "The periodic table organizes chemical elements by atomic number. Elements in " + "the same group share similar chemical properties.", + "Neural networks are computing systems inspired by biological neurons. 
They " + "consist of layers of interconnected nodes that process information.", + ] + + # Build the vector store + store = VectorStore() + store.add(knowledge) + + # Run RAG queries + queries = [ + "How does photosynthesis work?", + "What causes earthquakes?", + "How does the immune system fight disease?", + ] + + for query in queries: + print(f"\n{'='*60}") + print(f"Query: {query}") + print(f"{'='*60}") + answer = rag_query(store, query) + print(f"\nAnswer: {answer}") diff --git a/plugins/togetherai/skills/together-embeddings/scripts/semantic_search.py b/plugins/togetherai/skills/together-embeddings/scripts/semantic_search.py new file mode 100644 index 00000000..d6aa6c6b --- /dev/null +++ b/plugins/togetherai/skills/together-embeddings/scripts/semantic_search.py @@ -0,0 +1,161 @@ +#!/usr/bin/env python3 +""" +Together AI Semantic Search Pipeline (v2 SDK) + +Embed a product corpus, store vectors in memory, query by similarity, +and optionally rerank results with a dedicated endpoint. + +This example sits between the basic similarity demo (embed_and_rerank.py) +and the full RAG pipeline (rag_pipeline.py). It covers the common case of +pure vector search without a chat-generation step. + +Usage: + python semantic_search.py + +Requires: + uv pip install "together>=2.0.0" + export TOGETHER_API_KEY=your_key +""" + +import math +from together import Together + +client = Together() + +EMBEDDING_MODEL = "intfloat/multilingual-e5-large-instruct" + +# Set to your dedicated rerank endpoint model name to enable API reranking. +# See https://docs.together.ai/docs/rerank-overview +RERANK_MODEL: str | None = None + + +# --------------------------------------------------------------------------- +# In-memory vector store +# --------------------------------------------------------------------------- + +class VectorStore: + """Minimal in-memory vector store using cosine similarity. + + Good enough for prototyping and small corpora.
For production, swap in a + dedicated vector database (Pinecone, Weaviate, Chroma, pgvector, etc.). + """ + + def __init__(self) -> None: + self.texts: list[str] = [] + self.embeddings: list[list[float]] = [] + + def add(self, texts: list[str], batch_size: int = 100) -> None: + """Embed and store texts. Batches requests to stay within limits.""" + for start in range(0, len(texts), batch_size): + batch = texts[start : start + batch_size] + response = client.embeddings.create( + model=EMBEDDING_MODEL, + input=batch, + ) + for i, item in enumerate(response.data): + self.texts.append(batch[i]) + self.embeddings.append(item.embedding) + print(f"Indexed {len(self.texts)} documents " + f"({len(self.embeddings[0])} dimensions each)") + + def search(self, query: str, top_k: int = 10) -> list[dict]: + """Embed a query and return the top_k most similar documents.""" + query_emb = client.embeddings.create( + model=EMBEDDING_MODEL, + input=query, + ).data[0].embedding + + scored: list[dict] = [] + for idx, emb in enumerate(self.embeddings): + sim = _cosine_similarity(query_emb, emb) + scored.append({"index": idx, "text": self.texts[idx], "score": sim}) + scored.sort(key=lambda x: x["score"], reverse=True) + return scored[:top_k] + + +def _cosine_similarity(a: list[float], b: list[float]) -> float: + dot = sum(x * y for x, y in zip(a, b)) + norm_a = math.sqrt(sum(x * x for x in a)) + norm_b = math.sqrt(sum(x * x for x in b)) + return dot / (norm_a * norm_b) if norm_a and norm_b else 0.0 + + +# --------------------------------------------------------------------------- +# Optional rerank stage +# --------------------------------------------------------------------------- + +def rerank( + query: str, + candidates: list[dict], + top_n: int = 5, +) -> list[dict]: + """Rerank candidates via a dedicated endpoint, or fall back to + cosine-similarity order when no endpoint is configured.""" + if RERANK_MODEL is None: + print(" (no dedicated rerank endpoint -- keeping cosine-similarity 
order)") + return candidates[:top_n] + + documents = [c["text"] for c in candidates] + response = client.rerank.create( + model=RERANK_MODEL, + query=query, + documents=documents, + top_n=top_n, + return_documents=True, + ) + return [ + { + "index": candidates[item.index]["index"], + "text": documents[item.index], + "score": item.relevance_score, + } + for item in response.results + ] + + +# --------------------------------------------------------------------------- +# Example corpus and queries +# --------------------------------------------------------------------------- + +PRODUCTS = [ + "Lightweight mesh running shoes with responsive foam cushioning", + "Waterproof leather hiking boots with Vibram sole", + "Classic white canvas sneakers for everyday casual wear", + "Memory foam slip-on walking shoes for all-day comfort", + "Carbon-plate racing flats for marathon runners", + "Breathable trail running shoes with aggressive lug pattern", + "Minimalist barefoot running sandals with adjustable straps", + "Cushioned stability running shoes for overpronation support", + "High-top basketball shoes with ankle support and air units", + "Vegan leather formal Oxford shoes in matte black", + "Steel-toe work boots with electrical hazard protection", + "Soft knit slip-on sneakers with cloud-like insole", + "Women's lightweight cross-training shoes for HIIT workouts", + "Men's cushioned walking shoes with arch support inserts", + "Kids' velcro running shoes in neon green", + "Ultra-boost energy-return running shoes for long distances", + "Slip-resistant restaurant work clogs with padded collar", + "Retro suede skateboarding shoes with vulcanized sole", + "Waterproof Gore-Tex trail running shoes for wet conditions", + "Orthopedic walking shoes recommended by podiatrists", +] + + +if __name__ == "__main__": + # 1. Build the index + print("=== Indexing ===") + store = VectorStore() + store.add(PRODUCTS) + + # 2. 
Search + query = "comfortable running shoes" + print(f"\n=== Search: \"{query}\" ===") + results = store.search(query, top_k=10) + for rank, r in enumerate(results, 1): + print(f" {rank:>2}. [{r['score']:.4f}] {r['text']}") + + # 3. Rerank (falls back to cosine order without a dedicated endpoint) + print(f"\n=== Rerank top 5 ===") + top_5 = rerank(query, results, top_n=5) + for rank, r in enumerate(top_5, 1): + print(f" {rank:>2}. [{r['score']:.4f}] {r['text']}") diff --git a/plugins/togetherai/skills/together-evaluations/SKILL.md b/plugins/togetherai/skills/together-evaluations/SKILL.md new file mode 100644 index 00000000..045d7e9a --- /dev/null +++ b/plugins/togetherai/skills/together-evaluations/SKILL.md @@ -0,0 +1,81 @@ +--- +name: together-evaluations +description: "LLM-as-a-judge evaluation framework on Together AI. Classify, score, and compare model outputs, select judge models, use external-provider judges or targets, poll results and download reports. Reach for it whenever the user wants to benchmark outputs, grade responses, compare A/B variants, or operationalize automated evaluations." +--- + +# Together AI Evaluations + +## Overview + +Use Together AI evaluations when the user wants a managed LLM-as-a-judge workflow rather than an +ad hoc prompt loop. + +Core evaluation types: + +- Classify: assign outputs to labels +- Score: grade outputs on a numeric scale +- Compare: compare two candidate outputs with bias controls + +This skill also covers external providers used as judges or targets when the workflow still runs +through Together AI's evaluation system. 
+ +## When This Skill Wins + +- Benchmark prompt variants, models, or product responses +- Grade quality, safety, policy compliance, or task success +- Run A/B comparisons between model outputs +- Build repeatable evaluation jobs with uploaded datasets +- Pull results programmatically after asynchronous execution + +## Hand Off To Another Skill + +- Use `together-chat-completions` for one-off inference or manual judge prompts +- Use `together-batch-inference` for bulk offline generation rather than evaluation +- Use `together-fine-tuning` when the user wants to improve the model instead of just measure it +- Use `together-dedicated-endpoints` only if the evaluation target itself is a dedicated endpoint + +## Quick Routing + +- Classify / Score / Compare job setup + - Start with [scripts/run_evaluation.py](scripts/run_evaluation.py) or [scripts/run_evaluation.ts](scripts/run_evaluation.ts) + - Read [references/api-reference.md](references/api-reference.md) for exact request shapes +- Dataset formatting + - Read the dataset sections in [references/api-reference.md](references/api-reference.md) +- Dataset columns, Jinja2 templates, or pre-generated responses + - Read the dataset and template sections in [references/api-reference.md](references/api-reference.md) + - Use `--eval-column`, `--model-a-column`, or `--model-b-column` in the scripts +- External providers as judge or target + - Read the model-source and provider sections in [references/api-reference.md](references/api-reference.md) + - Use the scripts with `--judge-model-source external`, `--eval-model-source external`, or compare-side source flags +- Polling, listing, or downloading results + - Use the retrieval endpoints documented in [references/api-reference.md](references/api-reference.md) + - Use `--download-results` in the scripts when you want the per-row JSONL locally + +## Workflow + +1. Identify whether the user needs classify, score, or compare. +2. Define the dataset schema before writing code. 
+3. Upload the dataset as an eval file and keep the returned file ID. +4. Configure judge and target models explicitly, especially when mixing providers. +5. Poll status until completion, then download the result file for analysis. + +## High-Signal Rules + +- Python scripts require the Together v2 SDK (`together>=2.0.0`). If the user is on an older version, they must upgrade first: `uv pip install --upgrade "together>=2.0.0"`. +- The current SDK examples in this repo use `check=False` for eval uploads because local file validation can misclassify eval datasets. +- Treat dataset schema as part of the product contract; inconsistent fields cause downstream confusion. +- Compare evaluations are best when both candidate responses are already present in the dataset. +- Keep judge configuration explicit. Hidden defaults make benchmark interpretation harder. +- Use Together AI's managed evaluation job instead of rebuilding a manual judge loop when repeatability matters. + +## Resource Map + +- Full API reference: [references/api-reference.md](references/api-reference.md) +- Dataset formats, Jinja2 templates, and provider shortcuts: [references/api-reference.md](references/api-reference.md) +- Python workflow: [scripts/run_evaluation.py](scripts/run_evaluation.py) +- TypeScript workflow: [scripts/run_evaluation.ts](scripts/run_evaluation.ts) + +## Official Docs + +- [AI Evaluations](https://docs.together.ai/docs/ai-evaluations) +- [Evaluations API](https://docs.together.ai/reference/create-evaluation) diff --git a/plugins/togetherai/skills/together-evaluations/references/api-reference.md b/plugins/togetherai/skills/together-evaluations/references/api-reference.md new file mode 100644 index 00000000..622915dd --- /dev/null +++ b/plugins/togetherai/skills/together-evaluations/references/api-reference.md @@ -0,0 +1,758 @@ +# AI Evaluations API Reference +## Contents + +- [Endpoints](#endpoints) +- [Create Evaluation Request](#create-evaluation-request) +- [Judge Model 
Configuration](#judge-model-configuration) +- [Model Configuration (Evaluation Target)](#model-configuration-evaluation-target) +- [Evaluation Job Response](#evaluation-job-response) +- [Result Schemas](#result-schemas) +- [Evaluation Types](#evaluation-types) +- [Dataset Format](#dataset-format) +- [Jinja2 Templates](#jinja2-templates) +- [External Judges and Targets](#external-judges-and-targets) +- [Retrieve Evaluation](#retrieve-evaluation) +- [Get Evaluation Status](#get-evaluation-status) +- [List Evaluations](#list-evaluations) +- [List Evaluation Models](#list-evaluation-models) +- [Download Result File](#download-result-file) +- [Model Sources](#model-sources) +- [UI-Based Evaluations](#ui-based-evaluations) +- [Evaluation Status Flow](#evaluation-status-flow) +- [CLI Commands](#cli-commands) + + +## Endpoints + +| Endpoint | Operation | Description | +|----------|-----------|-------------| +| `POST /evaluation` | Create evaluation | Start a new evaluation job | +| `GET /evaluation/{id}` | Get evaluation | Retrieve evaluation details and results | +| `GET /evaluation/{id}/status` | Get status | Quick status and results check | +| `GET /evaluation` | List evaluations | List all evaluation jobs | +| `GET /evaluation/model-list` | List models | Models available for evaluation | + +Base URL: `https://api.together.xyz/v1` +Authentication: `Authorization: Bearer $TOGETHER_API_KEY` + +## Create Evaluation Request + +| Field | Type | Required | Description | +|-------|------|----------|-------------| +| `type` | string | Yes | `classify`, `score`, or `compare` | +| `parameters` | object | Yes | Type-specific parameters (see below) | + +### Classify Parameters + +| Field | Type | Required | Description | +|-------|------|----------|-------------| +| `judge` | JudgeModelConfig | Yes | Judge model configuration | +| `labels` | string[] | Yes | Classification categories (min 2) | +| `pass_labels` | string[] | Yes | Labels considered "passing" (min 1) | +| `input_data_file_path` | string | Yes | 
Uploaded dataset file ID | +| `model_to_evaluate` | ModelConfig or string | No | Model config object or dataset column name | + +### Score Parameters + +| Field | Type | Required | Description | +|-------|------|----------|-------------| +| `judge` | JudgeModelConfig | Yes | Judge model configuration | +| `min_score` | float | Yes | Minimum score value | +| `max_score` | float | Yes | Maximum score value | +| `pass_threshold` | float | Yes | Score at/above which is "passing" | +| `input_data_file_path` | string | Yes | Uploaded dataset file ID | +| `model_to_evaluate` | ModelConfig or string | No | Model config object or dataset column name | + +### Compare Parameters + +| Field | Type | Required | Description | +|-------|------|----------|-------------| +| `judge` | JudgeModelConfig | Yes | Judge model configuration | +| `input_data_file_path` | string | Yes | Uploaded dataset file ID | +| `model_a` | ModelConfig or string | No | Model A config or dataset column name | +| `model_b` | ModelConfig or string | No | Model B config or dataset column name | +| `disable_position_bias_correction` | boolean | No | Defaults to `false`. When `false`, the judge runs twice per sample (A then B, then B then A) and the verdicts are reconciled to cancel position bias; disagreements become "Tie". Set to `true` to run only the original-order pass, halving judge cost and latency at the cost of position-bias correction. | + +When both `model_a` and `model_b` are model configuration objects (not pre-generated column references), Together runs their inference in parallel, reducing total wall-clock time. 
+ +## Judge Model Configuration + +| Field | Type | Required | Description | +|-------|------|----------|-------------| +| `model` | string | Yes | Model name, endpoint ID, or external shortcut | +| `model_source` | string | Yes | `serverless`, `dedicated`, or `external` | +| `system_template` | string | Yes | Jinja2 system prompt for the judge | +| `max_tokens` | integer | No | Max tokens for the judge. Defaults to 32768. Increase for reasoning judges (e.g., Gemini or o-series) that consume output budget for chain-of-thought. | +| `temperature` | float | No | Sampling temperature for the judge. Defaults to 0.05. | +| `num_workers` | integer | No | Concurrent workers for judge inference. Defaults: `serverless` -> 25, `dedicated` -> 5 (minimum), `external` -> 2 for first-party APIs (OpenAI, Anthropic, Google) or 20 for proxy/aggregator endpoints (e.g. OpenRouter). Override to tune throughput. | +| `external_api_token` | string | No | API key for external providers | +| `external_base_url` | string | No | Custom OpenAI-compatible base URL | + +## Model Configuration (Evaluation Target) + +| Field | Type | Required | Description | +|-------|------|----------|-------------| +| `model` | string | Yes | Model name, endpoint ID, or external shortcut | +| `model_source` | string | Yes | `serverless`, `dedicated`, or `external` | +| `system_template` | string | Yes | System prompt for generation | +| `input_template` | string | Yes | Jinja2 input template (e.g., `{{prompt}}`) | +| `max_tokens` | integer | Yes | Maximum generation tokens | +| `temperature` | float | Yes | Generation temperature (0 to 2) | +| `num_workers` | integer | No | Concurrent workers for target inference. Defaults: `serverless` -> 25, `dedicated` -> 5 (minimum), `external` -> 2 for first-party APIs or 20 for proxy endpoints. Override to tune throughput. 
| +| `external_api_token` | string | No | API key for external providers | +| `external_base_url` | string | No | Custom OpenAI-compatible base URL | + +Alternatively, pass a string (dataset column name) to evaluate pre-generated responses. + +## Evaluation Job Response + +| Field | Type | Description | +|-------|------|-------------| +| `workflow_id` | string | Unique evaluation job ID | +| `type` | string | `classify`, `score`, or `compare` | +| `owner_id` | string | Job owner ID | +| `status` | string | `pending`, `queued`, `running`, `completed`, `error`, `user_error` | +| `status_updates` | array | Historical status changes with timestamps | +| `parameters` | object | Evaluation configuration used | +| `results` | object | Type-specific results (see below) | +| `created_at` | datetime | Creation timestamp | +| `updated_at` | datetime | Last update timestamp | + +## Result Schemas + +### Classify Results + +| Field | Type | Description | +|-------|------|-------------| +| `label_counts` | object | Count per label (e.g., `{"Toxic": 5, "Non-toxic": 45}`) | +| `pass_percentage` | float | Percentage with pass labels | +| `generation_fail_count` | int | Failed generations | +| `judge_fail_count` | int | Unevaluated samples | +| `invalid_label_count` | int | Unparseable judge responses | +| `result_file_id` | string | Per-row results file | + +### Score Results + +| Field | Type | Description | +|-------|------|-------------| +| `aggregated_scores.mean_score` | float | Mean of all scores | +| `aggregated_scores.std_score` | float | Standard deviation | +| `aggregated_scores.pass_percentage` | float | Percentage meeting threshold | +| `generation_fail_count` | int | Failed generations | +| `judge_fail_count` | int | Unevaluated samples | +| `invalid_score_count` | int | Unparseable or out-of-range scores | +| `failed_samples` | int | Total failures | +| `result_file_id` | string | Per-row results file | + +### Compare Results + +| Field | Type | Description | 
+|-------|------|-------------| +| `A_wins` | int | Model A preference count (Python SDK: `a_wins`) | +| `B_wins` | int | Model B preference count (Python SDK: `b_wins`) | +| `Ties` | int | No clear winner count (Python SDK: `ties`) | +| `generation_fail_count` | int | Failed generations | +| `judge_fail_count` | int | Unevaluated samples | +| `result_file_id` | string | Pairwise decision details | + +> Note: The REST API returns `A_wins`, `B_wins`, and `Ties` but the Python SDK +> Pydantic models convert these to snake_case (`a_wins`, `b_wins`, `ties`). The TypeScript +> SDK preserves the original casing. + +## Evaluation Types + +### 1. Classify + +```python +from together import Together +client = Together() + +evaluation = client.evals.create( + type="classify", + parameters={ + "input_data_file_path": "file-abc123", + "judge": { + "model": "deepseek-ai/DeepSeek-V4-Pro", + "model_source": "serverless", + "system_template": "Classify the response as Toxic or Non-toxic.", + }, + "labels": ["Toxic", "Non-toxic"], + "pass_labels": ["Non-toxic"], + "model_to_evaluate": { + "model": "Qwen/Qwen3.5-9B", + "model_source": "serverless", + "system_template": "You are a helpful assistant.", + "input_template": "{{prompt}}", + "max_tokens": 512, + "temperature": 0.7, + }, + }, +) +print(evaluation.workflow_id) +``` + +```typescript +import Together from "together-ai"; +const client = new Together(); + +const evaluation = await client.evals.create({ + type: "classify", + parameters: { + input_data_file_path: "file-abc123", + judge: { + model: "deepseek-ai/DeepSeek-V4-Pro", + model_source: "serverless", + system_template: "Classify the response as Toxic or Non-toxic.", + }, + labels: ["Toxic", "Non-toxic"], + pass_labels: ["Non-toxic"], + model_to_evaluate: { + model: "Qwen/Qwen3.5-9B", + model_source: "serverless", + system_template: "You are a helpful assistant.", + input_template: "{{prompt}}", + max_tokens: 512, + temperature: 0.7, + }, + }, +}); 
+console.log(evaluation.workflow_id); +``` + +```shell +curl -X POST "https://api.together.xyz/v1/evaluation" \ + -H "Authorization: Bearer $TOGETHER_API_KEY" \ + -H "Content-Type: application/json" \ + -d '{ + "type": "classify", + "parameters": { + "judge": { + "model": "deepseek-ai/DeepSeek-V4-Pro", + "model_source": "serverless", + "system_template": "Classify the response as Toxic or Non-toxic." + }, + "labels": ["Toxic", "Non-toxic"], + "pass_labels": ["Non-toxic"], + "model_to_evaluate": { + "model": "Qwen/Qwen3.5-9B", + "model_source": "serverless", + "system_template": "You are a helpful assistant.", + "input_template": "{{prompt}}", + "max_tokens": 512, + "temperature": 0.7 + }, + "input_data_file_path": "file-abc123" + } + }' +``` + +### 2. Score + +```python +evaluation = client.evals.create( + type="score", + parameters={ + "input_data_file_path": "file-abc123", + "judge": { + "model": "deepseek-ai/DeepSeek-V4-Pro", + "model_source": "serverless", + "system_template": "Rate the toxicity of the response from 1 to 10.", + }, + "min_score": 1.0, + "max_score": 10.0, + "pass_threshold": 7.0, + "model_to_evaluate": { + "model": "Qwen/Qwen3.5-9B", + "model_source": "serverless", + "system_template": "You are a helpful assistant.", + "input_template": "{{prompt}}", + "max_tokens": 512, + "temperature": 0.7, + }, + }, +) +``` + +```typescript +const evaluation = await client.evals.create({ + type: "score", + parameters: { + input_data_file_path: "file-abc123", + judge: { + model: "deepseek-ai/DeepSeek-V4-Pro", + model_source: "serverless", + system_template: "Rate the toxicity of the response from 1 to 10.", + }, + min_score: 1.0, + max_score: 10.0, + pass_threshold: 7.0, + model_to_evaluate: { + model: "Qwen/Qwen3.5-9B", + model_source: "serverless", + system_template: "You are a helpful assistant.", + input_template: "{{prompt}}", + max_tokens: 512, + temperature: 0.7, + }, + }, +}); +``` + +```shell +curl -X POST "https://api.together.xyz/v1/evaluation" \ + 
-H "Authorization: Bearer $TOGETHER_API_KEY" \ + -H "Content-Type: application/json" \ + -d '{ + "type": "score", + "parameters": { + "judge": { + "model": "deepseek-ai/DeepSeek-V4-Pro", + "model_source": "serverless", + "system_template": "Rate the toxicity of the response from 1 to 10." + }, + "min_score": 1.0, + "max_score": 10.0, + "pass_threshold": 7.0, + "model_to_evaluate": { + "model": "Qwen/Qwen3.5-9B", + "model_source": "serverless", + "system_template": "You are a helpful assistant.", + "input_template": "{{prompt}}", + "max_tokens": 512, + "temperature": 0.7 + }, + "input_data_file_path": "file-abc123" + } + }' +``` + +### 3. Compare + +```python +evaluation = client.evals.create( + type="compare", + parameters={ + "input_data_file_path": "file-abc123", + "judge": { + "model": "deepseek-ai/DeepSeek-V4-Pro", + "model_source": "serverless", + "system_template": "Assess which model has smarter and more helpful responses.", + }, + "model_a": { + "model": "Qwen/Qwen3-235B-A22B-Instruct-2507-tput", + "model_source": "serverless", + "system_template": "You are a helpful assistant.", + "input_template": "{{prompt}}", + "max_tokens": 512, + "temperature": 0.7, + }, + "model_b": { + "model": "Qwen/Qwen3.5-9B", + "model_source": "serverless", + "system_template": "You are a helpful assistant.", + "input_template": "{{prompt}}", + "max_tokens": 512, + "temperature": 0.7, + }, + }, +) +``` + +```typescript +const evaluation = await client.evals.create({ + type: "compare", + parameters: { + input_data_file_path: "file-abc123", + judge: { + model: "deepseek-ai/DeepSeek-V4-Pro", + model_source: "serverless", + system_template: + "Assess which model has smarter and more helpful responses.", + }, + model_a: { + model: "Qwen/Qwen3-235B-A22B-Instruct-2507-tput", + model_source: "serverless", + system_template: "You are a helpful assistant.", + input_template: "{{prompt}}", + max_tokens: 512, + temperature: 0.7, + }, + model_b: { + model: "Qwen/Qwen3.5-9B", + model_source: 
"serverless", + system_template: "You are a helpful assistant.", + input_template: "{{prompt}}", + max_tokens: 512, + temperature: 0.7, + }, + }, +}); +``` + +```shell +curl -X POST "https://api.together.xyz/v1/evaluation" \ + -H "Authorization: Bearer $TOGETHER_API_KEY" \ + -H "Content-Type: application/json" \ + -d '{ + "type": "compare", + "parameters": { + "judge": { + "model": "deepseek-ai/DeepSeek-V4-Pro", + "model_source": "serverless", + "system_template": "Assess which model has smarter and more helpful responses." + }, + "model_a": { + "model": "Qwen/Qwen3-235B-A22B-Instruct-2507-tput", + "model_source": "serverless", + "system_template": "You are a helpful assistant.", + "input_template": "{{prompt}}", + "max_tokens": 512, + "temperature": 0.7 + }, + "model_b": { + "model": "Qwen/Qwen3.5-9B", + "model_source": "serverless", + "system_template": "You are a helpful assistant.", + "input_template": "{{prompt}}", + "max_tokens": 512, + "temperature": 0.7 + }, + "input_data_file_path": "file-abc123" + } + }' +``` + +You can also compare pre-generated responses by passing dataset column names instead of model +configs: + +```python +evaluation = client.evals.create( + type="compare", + parameters={ + "input_data_file_path": "file-abc123", + "judge": { + "model": "deepseek-ai/DeepSeek-V4-Pro", + "model_source": "serverless", + "system_template": ( + "Assess which response is better. Consider clarity, accuracy, and usefulness." + ), + }, + "model_a": "response_a", + "model_b": "response_b", + }, +) +``` + +## Dataset Format + +Upload JSONL or CSV with `purpose="eval"`. When using the SDK helpers in this repo, pass +`check=False` to bypass the local file checker for eval datasets. 
+ +For classify and score jobs, include prompts and optionally pre-generated responses: + +```jsonl +{"prompt": "What is AI?", "response": "AI is artificial intelligence."} +{"prompt": "Capital of France?", "response": "The capital of France is Paris."} +``` + +For compare jobs with pre-generated outputs, include both candidate columns: + +```jsonl +{"prompt": "What is AI?", "response_a": "Answer from model A", "response_b": "Answer from model B"} +{"prompt": "Explain gravity.", "response_a": "A concise answer", "response_b": "A longer answer"} +``` + +If `model_to_evaluate`, `model_a`, or `model_b` is a model config instead of a column name, +Together generates the candidate responses at evaluation time. + +## Jinja2 Templates + +Both `system_template` and `input_template` support Jinja2 syntax: + +- `{{prompt}}` for simple substitution from dataset columns +- `{{metadata.topic}}` for nested field access +- conditionals and loops for more structured judge or generation prompts + +Typical target-model example: + +```json +{ + "system_template": "You are a helpful assistant focused on {{metadata.topic}}.", + "input_template": "Answer the following question:\n\n{{prompt}}" +} +``` + +## External Judges and Targets + +Use external providers as either judges or evaluation targets when the workflow still runs through +Together AI's evaluation system. 
+ +### External model as evaluation target + +```python +evaluation = client.evals.create( + type="classify", + parameters={ + "input_data_file_path": "file-abc123", + "judge": { + "model": "deepseek-ai/DeepSeek-V4-Pro", + "model_source": "serverless", + "system_template": "Classify the response as Toxic or Non-toxic.", + }, + "labels": ["Toxic", "Non-toxic"], + "pass_labels": ["Non-toxic"], + "model_to_evaluate": { + "model": "openai/gpt-5", + "model_source": "external", + "external_api_token": "sk-...", + "system_template": "You are a helpful assistant.", + "input_template": "{{prompt}}", + "max_tokens": 512, + "temperature": 0.7, + }, + }, +) +``` + +### External model as judge + +```python +evaluation = client.evals.create( + type="score", + parameters={ + "input_data_file_path": "file-abc123", + "judge": { + "model": "openai/gpt-5", + "model_source": "external", + "external_api_token": "sk-...", + "system_template": "Rate the response quality from 1 to 10.", + }, + "min_score": 1.0, + "max_score": 10.0, + "pass_threshold": 7.0, + "model_to_evaluate": "response", + }, +) +``` + +### Custom base URL + +```python +evaluation = client.evals.create( + type="classify", + parameters={ + "input_data_file_path": "file-abc123", + "judge": { + "model": "mistral-small-latest", + "model_source": "external", + "external_api_token": "your-mistral-key", + "external_base_url": "https://api.mistral.ai/", + "system_template": "Classify the response as Toxic or Non-toxic.", + }, + "labels": ["Toxic", "Non-toxic"], + "pass_labels": ["Non-toxic"], + "model_to_evaluate": "response", + }, +) +``` + +## Retrieve Evaluation + +```python +result = client.evals.retrieve("eval-abc123") +print(result.status, result.results) +``` + +```typescript +const result = await client.evals.retrieve("eval-abc123"); +console.log(result.status, result.results); +``` + +```shell +curl -X GET "https://api.together.xyz/v1/evaluation/eval-abc123" \ + -H "Authorization: Bearer $TOGETHER_API_KEY" +``` + 
+Example response: + +```json +{ + "workflow_id": "eval-7df2-1751287840", + "type": "compare", + "status": "completed", + "parameters": { "..." : "..." }, + "results": { + "A_wins": 1, + "B_wins": 13, + "Ties": 6, + "generation_fail_count": 0, + "judge_fail_count": 0, + "result_file_id": "file-95c8f0a3-e8cf-43ea-889a-e79b1f1ea1b9" + }, + "created_at": "2025-06-30T12:50:40.723521Z", + "updated_at": "2025-06-30T12:51:57.261342Z" +} +``` + +## Get Evaluation Status + +```python +status = client.evals.status("eval-abc123") +print(status.status, status.results) +``` + +```typescript +const status = await client.evals.status("eval-abc123"); +console.log(status.status, status.results); +``` + +```shell +curl -X GET "https://api.together.xyz/v1/evaluation/eval-abc123/status" \ + -H "Authorization: Bearer $TOGETHER_API_KEY" +``` + +## List Evaluations + +```python +evaluations = client.evals.list() +for e in evaluations: + print(e.workflow_id, e.status) +``` + +```typescript +const evaluations = await client.evals.list(); +for (const e of evaluations ?? []) { + console.log(e.workflow_id, e.status); +} +``` + +```shell +curl -X GET "https://api.together.xyz/v1/evaluation?status=completed&limit=10" \ + -H "Authorization: Bearer $TOGETHER_API_KEY" +``` + +Query parameters: `status` (optional filter), `limit` (default 10, max 100). + +## List Evaluation Models + +```python +models = client.evals.models() +``` + +```shell +curl -X GET "https://api.together.xyz/v1/evaluation/model-list" \ + -H "Authorization: Bearer $TOGETHER_API_KEY" +``` + +Query parameter: `model_source` (optional, defaults to `"all"`). 
+ +## Download Result File + +```python +with client.files.with_streaming_response.content(id="file-abc123") as resp: + with open("results.jsonl", "wb") as f: + for chunk in resp.iter_bytes(): + f.write(chunk) +``` + +```typescript +const content = await client.files.content("file-abc123"); +const text = await content.text(); +console.log(text); +``` + +```shell +curl -X GET "https://api.together.xyz/v1/files/file-abc123/content" \ + -H "Authorization: Bearer $TOGETHER_API_KEY" \ + -o results.jsonl +``` + +## Model Sources + +| Source | Description | Model field | +|--------|-------------|-------------| +| `serverless` | Together AI serverless models with structured output support | Model name (e.g., `deepseek-ai/DeepSeek-V4-Pro`) | +| `dedicated` | Your deployed dedicated endpoint | Endpoint ID | +| `external` | Third-party providers via shortcuts or custom URL | Provider shortcut (e.g., `openai/gpt-5`) | + +### External Provider Shortcuts + +| Provider | Models | +|----------|--------| +| OpenAI | `openai/gpt-5`, `openai/gpt-5-mini`, `openai/gpt-5-nano`, `openai/gpt-5.2`, `openai/gpt-5.2-pro`, `openai/gpt-4.1`, `openai/gpt-4o`, `openai/gpt-4o-mini` | +| Anthropic | `anthropic/claude-opus-4-5`, `anthropic/claude-sonnet-4-5`, `anthropic/claude-haiku-4-5`, `anthropic/claude-opus-4-1`, `anthropic/claude-opus-4-0`, `anthropic/claude-sonnet-4-0` | +| Google | `google/gemini-2.5-pro`, `google/gemini-2.5-flash`, `google/gemini-2.5-flash-lite`, `google/gemini-3-pro-preview` | + +For other providers, use `external_base_url` with any OpenAI-compatible chat/completions API. + +## UI-Based Evaluations + +Create and monitor evaluations via the Together AI dashboard at +[api.together.xyz/evaluations](https://api.together.xyz/evaluations) when the user wants a +no-code workflow or quick manual inspection. + +## Evaluation Status Flow + +`pending` -> `queued` -> `running` -> `completed` + +Error states: `error`, `user_error` + +Sub-1000 sample jobs typically complete within 1 hour. 
+ +## CLI Commands + +### Create + +```shell +together evals create [OPTIONS] +``` + +| Option | Description | +|--------|-------------| +| `--type [classify\|score\|compare]` | Type of evaluation (required) | +| `--judge-model TEXT` | Judge model name or URL (required) | +| `--judge-model-source [serverless\|dedicated\|external]` | Source of the judge model (required) | +| `--judge-system-template TEXT` | System template for the judge (required) | +| `--judge-external-api-token-env ENV_VAR` | Environment variable holding the API token for an external judge; prefer env vars over raw CLI token arguments | +| `--judge-external-base-url TEXT` | Custom base URL for external judge | +| `--input-data-file-path TEXT` | Path to the input data file (required) | +| `--model-field TEXT` | Field in input file containing model-generated text | +| `--model-to-evaluate TEXT` | Model name for detailed config | +| `--model-to-evaluate-source [serverless\|dedicated\|external]` | Source of model to evaluate | +| `--model-to-evaluate-max-tokens INTEGER` | Max tokens for model to evaluate | +| `--model-to-evaluate-temperature FLOAT` | Temperature for model to evaluate | +| `--model-to-evaluate-system-template TEXT` | System template for model to evaluate | +| `--model-to-evaluate-input-template TEXT` | Input template for model to evaluate | +| `--labels TEXT` | Classify: comma-separated labels | +| `--pass-labels TEXT` | Classify: labels considered passing | +| `--min-score FLOAT` | Score: minimum score value | +| `--max-score FLOAT` | Score: maximum score value | +| `--pass-threshold FLOAT` | Score: threshold for passing | +| `--model-a TEXT` | Compare: model A name | +| `--model-a-source [serverless\|dedicated\|external]` | Compare: source of model A | +| `--model-b TEXT` | Compare: model B name | +| `--model-b-source [serverless\|dedicated\|external]` | Compare: source of model B | +| `--disable-position-bias-correction` | Compare: skip the flipped-order judge pass and run only a 
single judge pass (original order). Halves judge cost and latency at the expense of position-bias correction. Default: off (two-pass mode). | + +### List + +```shell +together evals list [OPTIONS] +``` + +| Option | Description | +|--------|-------------| +| `--status` | Filter: `pending`, `queued`, `running`, `completed`, `error`, `user_error` | +| `--limit` | Number of results (max 100) | + +### Retrieve + +```shell +together evals retrieve EVALUATION_ID +``` + +### Status + +```shell +together evals status EVALUATION_ID +``` diff --git a/plugins/togetherai/skills/together-evaluations/scripts/run_evaluation.py b/plugins/togetherai/skills/together-evaluations/scripts/run_evaluation.py new file mode 100644 index 00000000..afeb2a5b --- /dev/null +++ b/plugins/togetherai/skills/together-evaluations/scripts/run_evaluation.py @@ -0,0 +1,467 @@ +#!/usr/bin/env python3 +""" +Together AI Evaluations - Run Classify, Score, and Compare (v2 SDK) + +Upload an eval dataset, create an evaluation, poll for results, and optionally +download the per-row results file. Supports serverless, dedicated, and external +judge or target models, plus dataset-column evaluation for pre-generated +responses. 
+ +Usage: + python run_evaluation.py --type classify + python run_evaluation.py --type score --dataset score_prompts.jsonl --eval-column response + python run_evaluation.py --type compare --model-a-column response_a --model-b-column response_b + python run_evaluation.py --type classify --eval-model openai/gpt-5 \ + --eval-model-source external --eval-external-api-token-env OPENAI_API_KEY + +Requires: + uv pip install "together>=2.0.0" + export TOGETHER_API_KEY=your_key +""" + +import argparse +import json +import os +import tempfile +import time +from pathlib import Path +from typing import Any + +from together import Together + +client = Together() + +MODEL_SOURCES = ("serverless", "dedicated", "external") +JUDGE_MODEL = "deepseek-ai/DeepSeek-V4-Pro" +EVAL_MODEL = "Qwen/Qwen3.5-9B" +DEFAULT_EVAL_SYSTEM_TEMPLATE = "You are a helpful assistant." +DEFAULT_INPUT_TEMPLATE = "{{prompt}}" +DEFAULT_CLASSIFY_TEMPLATE = "Classify the following text as positive, negative, or neutral sentiment." +DEFAULT_SCORE_TEMPLATE = ( + "Rate the quality of the response from 1 to 10, where 1 is very poor and 10 is excellent. " + "Consider accuracy, clarity, and completeness." +) +DEFAULT_COMPARE_TEMPLATE = ( + "Please assess which model has smarter and more helpful responses. Consider clarity, " + "accuracy, and usefulness." 
+) + + +def upload_dataset(dataset: list[dict[str, Any]]) -> str: + """Write dataset rows to JSONL and upload with purpose=eval.""" + with tempfile.NamedTemporaryFile("w", suffix=".jsonl", delete=False, encoding="utf-8") as temp_file: + for row in dataset: + temp_file.write(json.dumps(row) + "\n") + data_path = Path(temp_file.name) + + try: + file_response = client.files.upload(file=str(data_path), purpose="eval", check=False) + finally: + data_path.unlink(missing_ok=True) + + print(f"Uploaded dataset: {file_response.id}") + return file_response.id + + +def load_dataset(path: str | None, fallback_rows: list[dict[str, Any]]) -> list[dict[str, Any]]: + """Load dataset rows from JSONL, or return bundled sample rows.""" + if not path: + return fallback_rows + + with open(path, encoding="utf-8") as handle: + return [json.loads(line) for line in handle if line.strip()] + + +def poll_evaluation(workflow_id: str, poll_interval: int) -> Any: + """Poll until the evaluation completes or fails.""" + while True: + result = client.evals.status(workflow_id) + print(f" Status: {result.status}") + + if result.status == "completed": + return result + if result.status in ("error", "user_error"): + print("Evaluation failed") + return result + + time.sleep(poll_interval) + + +def result_file_id(result: Any) -> str | None: + """Return the per-row result file ID when present.""" + results = getattr(result, "results", None) + if not results: + return None + return getattr(results, "result_file_id", None) + + +def download_result_file(file_id: str, output_path: str) -> None: + """Download the result JSONL file to a local path.""" + destination = Path(output_path) + destination.parent.mkdir(parents=True, exist_ok=True) + + with client.files.with_streaming_response.content(id=file_id) as response: + with open(destination, "wb") as handle: + for chunk in response.iter_bytes(): + handle.write(chunk) + + print(f"Saved result rows to {destination}") + + +def build_judge_config(args: 
argparse.Namespace, default_template: str) -> dict[str, Any]: + """Build judge model config for serverless, dedicated, or external judges.""" + config: dict[str, Any] = { + "model": args.judge_model, + "model_source": args.judge_model_source, + "system_template": args.judge_system_template or default_template, + } + if args.judge_external_api_token_env: + config["external_api_token"] = read_secret_from_env(args.judge_external_api_token_env) + if args.judge_external_base_url: + config["external_base_url"] = args.judge_external_base_url + return config + + +def build_model_config( + *, + model: str, + model_source: str, + system_template: str, + input_template: str, + max_tokens: int, + temperature: float, + external_api_token_env: str | None = None, + external_base_url: str | None = None, +) -> dict[str, Any]: + """Build an evaluation target config.""" + config: dict[str, Any] = { + "model": model, + "model_source": model_source, + "system_template": system_template, + "input_template": input_template, + "max_tokens": max_tokens, + "temperature": temperature, + } + if external_api_token_env: + config["external_api_token"] = read_secret_from_env(external_api_token_env) + if external_base_url: + config["external_base_url"] = external_base_url + return config + + +def read_secret_from_env(name: str) -> str: + """Read a secret from an environment variable instead of a CLI argument.""" + value = os.environ.get(name) + if not value: + raise RuntimeError(f"Set {name} before using the matching external provider token option") + return value + + +def sample_dataset_for_args(args: argparse.Namespace) -> list[dict[str, Any]]: + """Return a bundled sample dataset that matches the selected workflow.""" + if args.type == "compare" and args.model_a_column and args.model_b_column: + return [ + { + "prompt": "Explain the theory of relativity.", + args.model_a_column: "Relativity explains gravity as the curvature of spacetime.", + args.model_b_column: "Einstein's theory says mass 
bends spacetime and changes motion.", + }, + { + "prompt": "How does photosynthesis work?", + args.model_a_column: "Plants convert sunlight, water, and carbon dioxide into sugar.", + args.model_b_column: "Photosynthesis uses light energy to create glucose and oxygen.", + }, + ] + + if args.type in {"classify", "score"} and args.eval_column: + return [ + { + "prompt": "Summarize what artificial intelligence is.", + args.eval_column: "Artificial intelligence is software that performs tasks requiring reasoning or prediction.", + }, + { + "prompt": "What causes rainbows?", + args.eval_column: "Rainbows form when water droplets refract, reflect, and disperse sunlight.", + }, + ] + + samples: dict[str, list[dict[str, Any]]] = { + "classify": [ + {"prompt": "The product arrived on time and works perfectly!"}, + {"prompt": "Terrible experience. The item was broken."}, + {"prompt": "It's okay, nothing special."}, + ], + "score": [ + {"prompt": "Explain quantum computing in simple terms."}, + {"prompt": "What causes rainbows?"}, + {"prompt": "How do vaccines work?"}, + ], + "compare": [ + {"prompt": "Explain the theory of relativity."}, + {"prompt": "What is the meaning of life?"}, + {"prompt": "How does photosynthesis work?"}, + ], + } + return samples[args.type] + + +def maybe_download_results(args: argparse.Namespace, result: Any) -> None: + """Download result rows when requested and available.""" + file_id = result_file_id(result) + if file_id: + print(f" Result file: {file_id}") + if args.download_results and file_id: + download_result_file(file_id, args.download_results) + + +def run_classify(args: argparse.Namespace, dataset: list[dict[str, Any]]) -> None: + """Classify evaluation - categorize responses into labels.""" + print("\n=== Classify Evaluation ===") + file_id = upload_dataset(dataset) + + model_to_evaluate: str | dict[str, Any] + if args.eval_column: + model_to_evaluate = args.eval_column + print(f"Using dataset column for candidate responses: 
{args.eval_column}") + else: + model_to_evaluate = build_model_config( + model=args.eval_model, + model_source=args.eval_model_source, + system_template=args.eval_system_template, + input_template=args.input_template, + max_tokens=args.max_tokens, + temperature=args.temperature, + external_api_token_env=args.eval_external_api_token_env, + external_base_url=args.eval_external_base_url, + ) + + evaluation = client.evals.create( + type="classify", + parameters={ + "input_data_file_path": file_id, + "judge": build_judge_config(args, DEFAULT_CLASSIFY_TEMPLATE), + "labels": ["positive", "negative", "neutral"], + "pass_labels": ["positive"], + "model_to_evaluate": model_to_evaluate, + }, + ) + print(f"Created evaluation: {evaluation.workflow_id}") + + result = poll_evaluation(evaluation.workflow_id, poll_interval=args.poll_interval) + if getattr(result, "results", None): + print(f" Label counts: {result.results.label_counts}") + print(f" Pass percentage: {result.results.pass_percentage}") + maybe_download_results(args, result) + + +def run_score(args: argparse.Namespace, dataset: list[dict[str, Any]]) -> None: + """Score evaluation - rate responses on a numerical scale.""" + print("\n=== Score Evaluation ===") + file_id = upload_dataset(dataset) + + model_to_evaluate: str | dict[str, Any] + if args.eval_column: + model_to_evaluate = args.eval_column + print(f"Using dataset column for candidate responses: {args.eval_column}") + else: + model_to_evaluate = build_model_config( + model=args.eval_model, + model_source=args.eval_model_source, + system_template=args.eval_system_template, + input_template=args.input_template, + max_tokens=args.max_tokens, + temperature=args.temperature, + external_api_token_env=args.eval_external_api_token_env, + external_base_url=args.eval_external_base_url, + ) + + evaluation = client.evals.create( + type="score", + parameters={ + "input_data_file_path": file_id, + "judge": build_judge_config(args, DEFAULT_SCORE_TEMPLATE), + "min_score": 1.0, + 
"max_score": 10.0, + "pass_threshold": 7.0, + "model_to_evaluate": model_to_evaluate, + }, + ) + print(f"Created evaluation: {evaluation.workflow_id}") + + result = poll_evaluation(evaluation.workflow_id, poll_interval=args.poll_interval) + if getattr(result, "results", None): + scores = result.results.aggregated_scores + if scores: + print(f" Mean score: {scores.mean_score}") + print(f" Std score: {scores.std_score}") + print(f" Pass percentage: {scores.pass_percentage}") + maybe_download_results(args, result) + + +def run_compare(args: argparse.Namespace, dataset: list[dict[str, Any]]) -> None: + """Compare evaluation - A/B comparison between generated or dataset-column outputs.""" + print("\n=== Compare Evaluation ===") + file_id = upload_dataset(dataset) + + if args.model_a_column and args.model_b_column: + model_a: str | dict[str, Any] = args.model_a_column + model_b: str | dict[str, Any] = args.model_b_column + print(f"Using dataset columns for comparisons: {args.model_a_column} vs {args.model_b_column}") + else: + model_a = build_model_config( + model=args.model_a, + model_source=args.model_a_source, + system_template=args.eval_system_template, + input_template=args.input_template, + max_tokens=args.max_tokens, + temperature=args.temperature, + external_api_token_env=args.model_a_external_api_token_env, + external_base_url=args.model_a_external_base_url, + ) + model_b = build_model_config( + model=args.model_b, + model_source=args.model_b_source, + system_template=args.eval_system_template, + input_template=args.input_template, + max_tokens=args.max_tokens, + temperature=args.temperature, + external_api_token_env=args.model_b_external_api_token_env, + external_base_url=args.model_b_external_base_url, + ) + + parameters: dict[str, Any] = { + "input_data_file_path": file_id, + "judge": build_judge_config(args, DEFAULT_COMPARE_TEMPLATE), + "model_a": model_a, + "model_b": model_b, + } + if args.disable_position_bias_correction: + 
parameters["disable_position_bias_correction"] = True + print("Position-bias correction disabled - running a single judge pass") + + evaluation = client.evals.create( + type="compare", + parameters=parameters, + ) + print(f"Created evaluation: {evaluation.workflow_id}") + + result = poll_evaluation(evaluation.workflow_id, poll_interval=args.poll_interval) + if getattr(result, "results", None): + print(f" A wins: {result.results.a_wins}") + print(f" B wins: {result.results.b_wins}") + print(f" Ties: {result.results.ties}") + maybe_download_results(args, result) + + +def parse_args() -> argparse.Namespace: + """Parse command-line arguments.""" + parser = argparse.ArgumentParser(description="Together AI evaluations workflow") + parser.add_argument( + "--type", + choices=["classify", "score", "compare"], + default="classify", + help="Evaluation workflow to run", + ) + parser.add_argument("--dataset", help="Path to a JSONL dataset; uses bundled samples when omitted") + parser.add_argument("--judge-model", default=JUDGE_MODEL, help="Judge model, endpoint ID, or provider shortcut") + parser.add_argument( + "--judge-model-source", + choices=MODEL_SOURCES, + default="serverless", + help="Source for the judge model", + ) + parser.add_argument( + "--judge-system-template", + help="Override the default judge Jinja2 template for the selected evaluation type", + ) + parser.add_argument("--judge-external-api-token-env", help="Environment variable holding the external judge API key") + parser.add_argument("--judge-external-base-url", help="Custom OpenAI-compatible base URL for the judge") + parser.add_argument("--eval-model", default=EVAL_MODEL, help="Target model for classify or score") + parser.add_argument( + "--eval-model-source", + choices=MODEL_SOURCES, + default="serverless", + help="Source for the target model used in classify or score", + ) + parser.add_argument("--eval-column", help="Dataset column containing pre-generated responses") + parser.add_argument( + 
"--eval-system-template", + default=DEFAULT_EVAL_SYSTEM_TEMPLATE, + help="System template for model-based evaluation targets", + ) + parser.add_argument( + "--input-template", + default=DEFAULT_INPUT_TEMPLATE, + help="Jinja2 input template for model-based evaluation targets", + ) + parser.add_argument("--max-tokens", type=int, default=512, help="Maximum generation tokens") + parser.add_argument("--temperature", type=float, default=0.7, help="Generation temperature") + parser.add_argument("--eval-external-api-token-env", help="Environment variable holding the external evaluation target API key") + parser.add_argument("--eval-external-base-url", help="Custom OpenAI-compatible base URL for the target") + parser.add_argument( + "--model-a", + default="Qwen/Qwen3-235B-A22B-Instruct-2507-tput", + help="Model A for compare evaluations", + ) + parser.add_argument( + "--model-a-source", + choices=MODEL_SOURCES, + default="serverless", + help="Source for model A", + ) + parser.add_argument("--model-a-column", help="Dataset column containing pre-generated model A responses") + parser.add_argument("--model-a-external-api-token-env", help="Environment variable holding the external model A API key") + parser.add_argument("--model-a-external-base-url", help="Custom OpenAI-compatible base URL for model A") + parser.add_argument("--model-b", default=EVAL_MODEL, help="Model B for compare evaluations") + parser.add_argument( + "--model-b-source", + choices=MODEL_SOURCES, + default="serverless", + help="Source for model B", + ) + parser.add_argument("--model-b-column", help="Dataset column containing pre-generated model B responses") + parser.add_argument("--model-b-external-api-token-env", help="Environment variable holding the external model B API key") + parser.add_argument("--model-b-external-base-url", help="Custom OpenAI-compatible base URL for model B") + parser.add_argument( + "--disable-position-bias-correction", + action="store_true", + help=( + "Compare only: skip the 
flipped-order judge pass and run a single pass. " + "Halves judge cost and latency at the expense of position-bias correction." + ), + ) + parser.add_argument( + "--poll-interval", + type=int, + default=5, + help="Seconds between evaluation status checks", + ) + parser.add_argument( + "--download-results", + help="Optional local path for the per-row results JSONL file", + ) + args = parser.parse_args() + + if (args.model_a_column and not args.model_b_column) or (args.model_b_column and not args.model_a_column): + parser.error("--model-a-column and --model-b-column must be provided together") + if args.type != "compare" and (args.model_a_column or args.model_b_column): + parser.error("--model-a-column and --model-b-column only apply to --type compare") + if args.type != "compare" and args.disable_position_bias_correction: + parser.error("--disable-position-bias-correction only applies to --type compare") + return args + + +def main() -> None: + args = parse_args() + dataset = load_dataset(args.dataset, fallback_rows=sample_dataset_for_args(args)) + + if args.type == "classify": + run_classify(args, dataset) + elif args.type == "score": + run_score(args, dataset) + else: + run_compare(args, dataset) + + +if __name__ == "__main__": + main() diff --git a/plugins/togetherai/skills/together-evaluations/scripts/run_evaluation.ts b/plugins/togetherai/skills/together-evaluations/scripts/run_evaluation.ts new file mode 100644 index 00000000..69199b3c --- /dev/null +++ b/plugins/togetherai/skills/together-evaluations/scripts/run_evaluation.ts @@ -0,0 +1,556 @@ +#!/usr/bin/env -S npx tsx +/** + * Together AI Evaluations - Run Classify, Score, and Compare (TypeScript SDK) + * + * Upload an eval dataset, create an evaluation, poll for results, and + * optionally download the per-row results file. Supports serverless, + * dedicated, and external judge or target models, plus dataset-column + * evaluation for pre-generated responses. 
+ * + * Usage: + * npx tsx run_evaluation.ts --type classify + * npx tsx run_evaluation.ts --type score --dataset score_prompts.jsonl --eval-column response + * npx tsx run_evaluation.ts --type compare --model-a-column response_a --model-b-column response_b + * npx tsx run_evaluation.ts --type classify --eval-model openai/gpt-5 \ + * --eval-model-source external --eval-external-api-token-env OPENAI_API_KEY + * + * Requires: + * npm install together-ai + * export TOGETHER_API_KEY=your_key + */ + +import * as fs from "node:fs"; +import * as os from "node:os"; +import * as path from "node:path"; + +import Together from "together-ai"; + +const client = new Together(); + +type EvalType = "classify" | "score" | "compare"; +type ModelSource = "serverless" | "dedicated" | "external"; +type DatasetRow = Record; + +type ScriptArgs = { + type: EvalType; + dataset?: string; + judgeModel: string; + judgeModelSource: ModelSource; + judgeSystemTemplate?: string; + judgeExternalApiTokenEnv?: string; + judgeExternalBaseUrl?: string; + evalModel: string; + evalModelSource: ModelSource; + evalColumn?: string; + evalSystemTemplate: string; + inputTemplate: string; + maxTokens: number; + temperature: number; + evalExternalApiTokenEnv?: string; + evalExternalBaseUrl?: string; + modelA: string; + modelASource: ModelSource; + modelAColumn?: string; + modelAExternalApiTokenEnv?: string; + modelAExternalBaseUrl?: string; + modelB: string; + modelBSource: ModelSource; + modelBColumn?: string; + modelBExternalApiTokenEnv?: string; + modelBExternalBaseUrl?: string; + disablePositionBiasCorrection: boolean; + pollInterval: number; + downloadResults?: string; +}; + +const JUDGE_MODEL = "deepseek-ai/DeepSeek-V4-Pro"; +const EVAL_MODEL = "Qwen/Qwen3.5-9B"; +const DEFAULT_EVAL_SYSTEM_TEMPLATE = "You are a helpful assistant."; +const DEFAULT_INPUT_TEMPLATE = "{{prompt}}"; +const DEFAULT_CLASSIFY_TEMPLATE = + "Classify the following text as positive, negative, or neutral sentiment."; +const 
DEFAULT_SCORE_TEMPLATE = + "Rate the quality of the response from 1 to 10, where 1 is very poor and 10 is excellent. Consider accuracy, clarity, and completeness."; +const DEFAULT_COMPARE_TEMPLATE = + "Please assess which model has smarter and more helpful responses. Consider clarity, accuracy, and usefulness."; + +function printHelp(): void { + console.log(`Together AI evaluations workflow + +Flags: + --type classify|score|compare + --dataset PATH + --judge-model MODEL + --judge-model-source serverless|dedicated|external + --judge-system-template TEMPLATE + --judge-external-api-token-env ENV_VAR + --judge-external-base-url URL + --eval-model MODEL + --eval-model-source serverless|dedicated|external + --eval-column COLUMN + --eval-system-template TEMPLATE + --input-template TEMPLATE + --max-tokens N + --temperature FLOAT + --eval-external-api-token-env ENV_VAR + --eval-external-base-url URL + --model-a MODEL + --model-a-source serverless|dedicated|external + --model-a-column COLUMN + --model-a-external-api-token-env ENV_VAR + --model-a-external-base-url URL + --model-b MODEL + --model-b-source serverless|dedicated|external + --model-b-column COLUMN + --model-b-external-api-token-env ENV_VAR + --model-b-external-base-url URL + --disable-position-bias-correction + --poll-interval SECONDS + --download-results PATH`); +} + +const BOOLEAN_FLAGS = new Set(["disable-position-bias-correction"]); + +type ParsedFlags = { + values: Record; + bools: Set; +}; + +function parseFlagMap(argv: string[]): ParsedFlags { + const values: Record = {}; + const bools = new Set(); + + for (let i = 0; i < argv.length; i += 1) { + const arg = argv[i]; + if (arg === "--help") { + printHelp(); + process.exit(0); + } + if (!arg.startsWith("--")) { + throw new Error(`Unexpected argument: ${arg}`); + } + const flagName = arg.slice(2); + if (BOOLEAN_FLAGS.has(flagName)) { + bools.add(flagName); + continue; + } + const next = argv[i + 1]; + if (!next || next.startsWith("--")) { + throw new 
Error(`Missing value for ${arg}`); + } + values[flagName] = next; + i += 1; + } + + return { values, bools }; +} + +function parseEvalType(value: string | undefined): EvalType { + const next = value ?? "classify"; + if (next === "classify" || next === "score" || next === "compare") { + return next; + } + throw new Error(`Invalid --type value: ${next}`); +} + +function parseModelSource(value: string | undefined, flagName: string): ModelSource { + const next = value ?? "serverless"; + if (next === "serverless" || next === "dedicated" || next === "external") { + return next; + } + throw new Error(`Invalid ${flagName} value: ${next}`); +} + +function parseNumber(value: string | undefined, fallback: number, flagName: string): number { + if (value === undefined) { + return fallback; + } + const next = Number(value); + if (!Number.isFinite(next)) { + throw new Error(`Invalid ${flagName} value: ${value}`); + } + return next; +} + +function parseScriptArgs(): ScriptArgs { + const { values: flags, bools } = parseFlagMap(process.argv.slice(2)); + const type = parseEvalType(flags.type); + + const args: ScriptArgs = { + type, + dataset: flags.dataset, + judgeModel: flags["judge-model"] ?? JUDGE_MODEL, + judgeModelSource: parseModelSource(flags["judge-model-source"], "--judge-model-source"), + judgeSystemTemplate: flags["judge-system-template"], + judgeExternalApiTokenEnv: flags["judge-external-api-token-env"], + judgeExternalBaseUrl: flags["judge-external-base-url"], + evalModel: flags["eval-model"] ?? EVAL_MODEL, + evalModelSource: parseModelSource(flags["eval-model-source"], "--eval-model-source"), + evalColumn: flags["eval-column"], + evalSystemTemplate: flags["eval-system-template"] ?? DEFAULT_EVAL_SYSTEM_TEMPLATE, + inputTemplate: flags["input-template"] ?? 
DEFAULT_INPUT_TEMPLATE, + maxTokens: parseNumber(flags["max-tokens"], 512, "--max-tokens"), + temperature: parseNumber(flags.temperature, 0.7, "--temperature"), + evalExternalApiTokenEnv: flags["eval-external-api-token-env"], + evalExternalBaseUrl: flags["eval-external-base-url"], + modelA: flags["model-a"] ?? "Qwen/Qwen3-235B-A22B-Instruct-2507-tput", + modelASource: parseModelSource(flags["model-a-source"], "--model-a-source"), + modelAColumn: flags["model-a-column"], + modelAExternalApiTokenEnv: flags["model-a-external-api-token-env"], + modelAExternalBaseUrl: flags["model-a-external-base-url"], + modelB: flags["model-b"] ?? EVAL_MODEL, + modelBSource: parseModelSource(flags["model-b-source"], "--model-b-source"), + modelBColumn: flags["model-b-column"], + modelBExternalApiTokenEnv: flags["model-b-external-api-token-env"], + modelBExternalBaseUrl: flags["model-b-external-base-url"], + disablePositionBiasCorrection: bools.has("disable-position-bias-correction"), + pollInterval: parseNumber(flags["poll-interval"], 5, "--poll-interval"), + downloadResults: flags["download-results"], + }; + + if ((args.modelAColumn && !args.modelBColumn) || (args.modelBColumn && !args.modelAColumn)) { + throw new Error("--model-a-column and --model-b-column must be provided together"); + } + if (args.type !== "compare" && (args.modelAColumn || args.modelBColumn)) { + throw new Error("--model-a-column and --model-b-column only apply to --type compare"); + } + if (args.type !== "compare" && args.disablePositionBiasCorrection) { + throw new Error("--disable-position-bias-correction only applies to --type compare"); + } + + return args; +} + +async function uploadDataset(dataset: DatasetRow[]): Promise { + const dataPath = path.join(os.tmpdir(), `eval_data_${Date.now()}.jsonl`); + const lines = dataset.map((row) => JSON.stringify(row)).join("\n") + "\n"; + fs.writeFileSync(dataPath, lines, "utf8"); + + try { + const fileResponse = await client.files.upload(dataPath, "eval", false); + 
console.log(`Uploaded dataset: ${fileResponse.id}`); + return fileResponse.id; + } finally { + fs.rmSync(dataPath, { force: true }); + } +} + +function loadDataset(datasetPath: string | undefined, fallbackRows: DatasetRow[]): DatasetRow[] { + if (!datasetPath) { + return fallbackRows; + } + + return fs + .readFileSync(datasetPath, "utf8") + .split("\n") + .map((line) => line.trim()) + .filter(Boolean) + .map((line) => JSON.parse(line) as DatasetRow); +} + +async function sleep(ms: number): Promise { + await new Promise((resolve) => setTimeout(resolve, ms)); +} + +async function pollEvaluation(workflowId: string, pollIntervalSeconds: number): Promise { + while (true) { + const result = await client.evals.status(workflowId); + console.log(` Status: ${result.status}`); + + if (result.status === "completed") { + return result; + } + if (result.status === "error" || result.status === "user_error") { + console.error("Evaluation failed"); + return result; + } + + await sleep(pollIntervalSeconds * 1000); + } +} + +function getResultFileId(result: any): string | undefined { + return result?.results?.result_file_id; +} + +function readSecretFromEnv(name: string): string { + const value = process.env[name]; + if (!value) { + throw new Error(`Set ${name} before using the matching external provider token option`); + } + return value; +} + +async function downloadResultFile(fileId: string, outputPath: string): Promise { + fs.mkdirSync(path.dirname(outputPath), { recursive: true }); + const response = await client.files.content(fileId); + const text = await response.text(); + fs.writeFileSync(outputPath, text, "utf8"); + console.log(`Saved result rows to ${outputPath}`); +} + +function buildJudgeConfig(args: ScriptArgs, defaultTemplate: string): Record { + const config: Record = { + model: args.judgeModel, + model_source: args.judgeModelSource, + system_template: args.judgeSystemTemplate ?? 
defaultTemplate, + }; + if (args.judgeExternalApiTokenEnv) { + config.external_api_token = readSecretFromEnv(args.judgeExternalApiTokenEnv); + } + if (args.judgeExternalBaseUrl) { + config.external_base_url = args.judgeExternalBaseUrl; + } + return config; +} + +function buildModelConfig(options: { + model: string; + modelSource: ModelSource; + systemTemplate: string; + inputTemplate: string; + maxTokens: number; + temperature: number; + externalApiTokenEnv?: string; + externalBaseUrl?: string; +}): Record { + const config: Record = { + model: options.model, + model_source: options.modelSource, + system_template: options.systemTemplate, + input_template: options.inputTemplate, + max_tokens: options.maxTokens, + temperature: options.temperature, + }; + if (options.externalApiTokenEnv) { + config.external_api_token = readSecretFromEnv(options.externalApiTokenEnv); + } + if (options.externalBaseUrl) { + config.external_base_url = options.externalBaseUrl; + } + return config; +} + +function sampleDatasetForArgs(args: ScriptArgs): DatasetRow[] { + if (args.type === "compare" && args.modelAColumn && args.modelBColumn) { + return [ + { + prompt: "Explain the theory of relativity.", + [args.modelAColumn]: "Relativity explains gravity as the curvature of spacetime.", + [args.modelBColumn]: "Einstein's theory says mass bends spacetime and changes motion.", + }, + { + prompt: "How does photosynthesis work?", + [args.modelAColumn]: "Plants convert sunlight, water, and carbon dioxide into sugar.", + [args.modelBColumn]: "Photosynthesis uses light energy to create glucose and oxygen.", + }, + ]; + } + + if ((args.type === "classify" || args.type === "score") && args.evalColumn) { + return [ + { + prompt: "Summarize what artificial intelligence is.", + [args.evalColumn]: + "Artificial intelligence is software that performs tasks requiring reasoning or prediction.", + }, + { + prompt: "What causes rainbows?", + [args.evalColumn]: + "Rainbows form when water droplets refract, 
reflect, and disperse sunlight.", + }, + ]; + } + + if (args.type === "classify") { + return [ + { prompt: "The product arrived on time and works perfectly!" }, + { prompt: "Terrible experience. The item was broken." }, + { prompt: "It's okay, nothing special." }, + ]; + } + if (args.type === "score") { + return [ + { prompt: "Explain quantum computing in simple terms." }, + { prompt: "What causes rainbows?" }, + { prompt: "How do vaccines work?" }, + ]; + } + return [ + { prompt: "Explain the theory of relativity." }, + { prompt: "What is the meaning of life?" }, + { prompt: "How does photosynthesis work?" }, + ]; +} + +async function maybeDownloadResults(args: ScriptArgs, result: any): Promise { + const fileId = getResultFileId(result); + if (fileId) { + console.log(` Result file: ${fileId}`); + } + if (args.downloadResults && fileId) { + await downloadResultFile(fileId, args.downloadResults); + } +} + +async function runClassify(args: ScriptArgs, dataset: DatasetRow[]): Promise { + console.log("\n=== Classify Evaluation ==="); + const fileId = await uploadDataset(dataset); + + const modelToEvaluate = + args.evalColumn ?? 
+ buildModelConfig({ + model: args.evalModel, + modelSource: args.evalModelSource, + systemTemplate: args.evalSystemTemplate, + inputTemplate: args.inputTemplate, + maxTokens: args.maxTokens, + temperature: args.temperature, + externalApiTokenEnv: args.evalExternalApiTokenEnv, + externalBaseUrl: args.evalExternalBaseUrl, + }); + + if (typeof modelToEvaluate === "string") { + console.log(`Using dataset column for candidate responses: ${modelToEvaluate}`); + } + + const evaluation = await client.evals.create({ + type: "classify", + parameters: { + input_data_file_path: fileId, + judge: buildJudgeConfig(args, DEFAULT_CLASSIFY_TEMPLATE), + labels: ["positive", "negative", "neutral"], + pass_labels: ["positive"], + model_to_evaluate: modelToEvaluate, + }, + }); + console.log(`Created evaluation: ${evaluation.workflow_id}`); + + const result = await pollEvaluation(evaluation.workflow_id!, args.pollInterval); + if (result.results) { + console.log(` Label counts: ${JSON.stringify(result.results.label_counts ?? {})}`); + console.log(` Pass percentage: ${result.results.pass_percentage}`); + await maybeDownloadResults(args, result); + } +} + +/** Upload the dataset, create a "score" evaluation (min_score 1.0, max_score 10.0, pass_threshold 7.0), poll it, and print the aggregated scores. Candidate responses come from the args.evalColumn dataset column when set, otherwise from a model config built from the eval* arguments. */ +async function runScore(args: ScriptArgs, dataset: DatasetRow[]): Promise<void> { + console.log("\n=== Score Evaluation ==="); + const fileId = await uploadDataset(dataset); + + const modelToEvaluate = + args.evalColumn ?? 
+ buildModelConfig({ + model: args.evalModel, + modelSource: args.evalModelSource, + systemTemplate: args.evalSystemTemplate, + inputTemplate: args.inputTemplate, + maxTokens: args.maxTokens, + temperature: args.temperature, + externalApiTokenEnv: args.evalExternalApiTokenEnv, + externalBaseUrl: args.evalExternalBaseUrl, + }); + + if (typeof modelToEvaluate === "string") { + console.log(`Using dataset column for candidate responses: ${modelToEvaluate}`); + } + + const evaluation = await client.evals.create({ + type: "score", + parameters: { + input_data_file_path: fileId, + judge: buildJudgeConfig(args, DEFAULT_SCORE_TEMPLATE), + min_score: 1.0, + max_score: 10.0, + pass_threshold: 7.0, + model_to_evaluate: modelToEvaluate, + }, + }); + console.log(`Created evaluation: ${evaluation.workflow_id}`); + + const result = await pollEvaluation(evaluation.workflow_id!, args.pollInterval); + if (result.results?.aggregated_scores) { + const scores = result.results.aggregated_scores; + console.log(` Mean score: ${scores.mean_score}`); + console.log(` Std score: ${scores.std_score}`); + console.log(` Pass percentage: ${scores.pass_percentage}`); + await maybeDownloadResults(args, result); + } +} + +/** Upload the dataset, create a "compare" evaluation between model A and model B, poll it, and print the A/B/tie counts. Each side is either a dataset column name (args.modelAColumn / args.modelBColumn) or a model config built from the matching model* arguments. */ +async function runCompare(args: ScriptArgs, dataset: DatasetRow[]): Promise<void> { + console.log("\n=== Compare Evaluation ==="); + const fileId = await uploadDataset(dataset); + + const modelA = + args.modelAColumn ?? + buildModelConfig({ + model: args.modelA, + modelSource: args.modelASource, + systemTemplate: args.evalSystemTemplate, + inputTemplate: args.inputTemplate, + maxTokens: args.maxTokens, + temperature: args.temperature, + externalApiTokenEnv: args.modelAExternalApiTokenEnv, + externalBaseUrl: args.modelAExternalBaseUrl, + }); + const modelB = + args.modelBColumn ?? 
+ buildModelConfig({ + model: args.modelB, + modelSource: args.modelBSource, + systemTemplate: args.evalSystemTemplate, + inputTemplate: args.inputTemplate, + maxTokens: args.maxTokens, + temperature: args.temperature, + externalApiTokenEnv: args.modelBExternalApiTokenEnv, + externalBaseUrl: args.modelBExternalBaseUrl, + }); + + if (typeof modelA === "string" && typeof modelB === "string") { + console.log(`Using dataset columns for comparisons: ${modelA} vs ${modelB}`); + } + + /* Loose record so the optional disable_position_bias_correction key can be added before the request is sent. */ + const parameters: Record<string, unknown> = { + input_data_file_path: fileId, + judge: buildJudgeConfig(args, DEFAULT_COMPARE_TEMPLATE), + model_a: modelA, + model_b: modelB, + }; + if (args.disablePositionBiasCorrection) { + parameters.disable_position_bias_correction = true; + console.log("Position-bias correction disabled - running a single judge pass"); + } + + const evaluation = await client.evals.create({ + type: "compare", + parameters: parameters as any, + }); + console.log(`Created evaluation: ${evaluation.workflow_id}`); + + const result = await pollEvaluation(evaluation.workflow_id!, args.pollInterval); + if (result.results) { + console.log(` A wins: ${result.results.A_wins}`); + console.log(` B wins: ${result.results.B_wins}`); + console.log(` Ties: ${result.results.Ties}`); + await maybeDownloadResults(args, result); + } +} + +/** Script entry point: parse the CLI arguments, load the dataset, and dispatch on args.type ("classify", "score", anything else runs compare). */ +async function main(): Promise<void> { + const args = parseScriptArgs(); + const dataset = loadDataset(args.dataset, sampleDatasetForArgs(args)); + + if (args.type === "classify") { + await runClassify(args, dataset); + } else if (args.type === "score") { + await runScore(args, dataset); + } else { + await runCompare(args, dataset); + } +} + +main().catch((error) => { + console.error(error); + process.exit(1); +}); diff --git a/plugins/togetherai/skills/together-fine-tuning/SKILL.md b/plugins/togetherai/skills/together-fine-tuning/SKILL.md new file mode 100644 index 00000000..c965ba9d --- /dev/null +++ b/plugins/togetherai/skills/together-fine-tuning/SKILL.md @@ -0,0 +1,88 @@ 
+--- +name: together-fine-tuning +description: "LoRA, full fine-tuning, DPO preference tuning, VLM training, function-calling tuning, reasoning tuning, and BYOM uploads on Together AI. Reach for it whenever the user wants to adapt a model on custom data rather than only run inference, evaluate outputs, or host an existing model." +--- + +# Together Fine-Tuning + +## Overview + +Use Together AI fine-tuning when the user needs to adapt a model to their own data or behavior. + +Supported workflows in this repo: + +- LoRA fine-tuning +- full fine-tuning +- DPO preference tuning +- VLM fine-tuning +- function-calling fine-tuning +- reasoning fine-tuning +- BYOM upload paths + +## When This Skill Wins + +- Train a model on custom instruction or conversational data +- Improve function-calling reliability with supervised examples +- Train on preferences rather than only demonstrations +- Fine-tune multimodal or reasoning-oriented models +- Deploy a fine-tuned output model later through dedicated endpoints + +## Hand Off To Another Skill + +- Use `together-chat-completions` for plain inference without training +- Use `together-evaluations` to measure a model before or after tuning +- Use `together-dedicated-endpoints` to host the resulting tuned model +- Use `together-gpu-clusters` only when the user needs raw infrastructure rather than managed tuning + +## Quick Routing + +- Standard LoRA or full fine-tuning + - Start with [scripts/finetune_workflow.py](scripts/finetune_workflow.py) + - Read [references/data-formats.md](references/data-formats.md) +- DPO preference tuning + - Start with [scripts/dpo_workflow.py](scripts/dpo_workflow.py) +- Function-calling tuning + - Start with [scripts/function_calling_finetune.py](scripts/function_calling_finetune.py) +- Reasoning tuning + - Start with [scripts/reasoning_finetune.py](scripts/reasoning_finetune.py) +- VLM tuning + - Start with [scripts/vlm_finetune.py](scripts/vlm_finetune.py) +- Model support and deployment options + - 
Read [references/supported-models.md](references/supported-models.md) + - Read [references/deployment.md](references/deployment.md) + +## Workflow + +1. Choose the tuning method that matches the desired behavior change. +2. Validate dataset format before spending tokens on training. +3. Upload training data and keep the returned file ID. +4. Create the job with explicit method-specific parameters. +5. Monitor job state, events, checkpoints, and per-step training metrics before handing off to deployment. + +## High-Signal Rules + +- Python scripts require the Together v2 SDK (`together>=2.0.0`). If the user is on an older version, they must upgrade first: `uv pip install --upgrade "together>=2.0.0"`. +- Prefer LoRA unless the user has a specific reason to pay for full fine-tuning. +- Keep data-format validation close to the upload step so bad files fail early. +- Treat deployment as a separate phase; fine-tuning success does not automatically mean serving success. +- Use the method-specific script instead of overloading one generic workflow for all modes. +- Parameterize dataset paths, model IDs, and suffixes in automation instead of embedding one demo dataset forever. 
+ +## Resource Map + +- Data formats: [references/data-formats.md](references/data-formats.md) +- Supported models: [references/supported-models.md](references/supported-models.md) +- Deployment guide: [references/deployment.md](references/deployment.md) +- LoRA or full workflow: [scripts/finetune_workflow.py](scripts/finetune_workflow.py) +- DPO workflow: [scripts/dpo_workflow.py](scripts/dpo_workflow.py) +- Function-calling workflow: [scripts/function_calling_finetune.py](scripts/function_calling_finetune.py) +- Reasoning workflow: [scripts/reasoning_finetune.py](scripts/reasoning_finetune.py) +- VLM workflow: [scripts/vlm_finetune.py](scripts/vlm_finetune.py) + +## Official Docs + +- [Fine-tuning Quickstart](https://docs.together.ai/docs/fine-tuning-quickstart) +- [Data Preparation](https://docs.together.ai/docs/fine-tuning-data-preparation) +- [Fine-tuning Models](https://docs.together.ai/docs/fine-tuning-models) +- [Deploying a Fine-Tuned Model](https://docs.together.ai/docs/deploying-a-fine-tuned-model) +- [Fine-tuning API](https://docs.together.ai/reference/post-fine-tunes) diff --git a/plugins/togetherai/skills/together-fine-tuning/references/data-formats.md b/plugins/togetherai/skills/together-fine-tuning/references/data-formats.md new file mode 100644 index 00000000..8ae4fe43 --- /dev/null +++ b/plugins/togetherai/skills/together-fine-tuning/references/data-formats.md @@ -0,0 +1,368 @@ +# Fine-tuning Data Formats Reference +## Contents + +- [Format Overview](#format-overview) +- [Conversational Format](#conversational-format) +- [Instruction Format](#instruction-format) +- [Generic Text Format](#generic-text-format) +- [Preference/DPO Format](#preferencedpo-format) +- [Reasoning Format](#reasoning-format) +- [Function Calling Format](#function-calling-format) +- [VLM Conversational Format](#vlm-conversational-format) +- [VLM Instruction Format](#vlm-instruction-format) +- [File Formats](#file-formats) +- [Loss Masking](#loss-masking) +- [Sample 
Weights](#sample-weights) +- [Data Validation](#data-validation) +- [Converting Image URLs to Base64](#converting-image-urls-to-base64) + + +## Format Overview + +| Format | Use Case | Key Field | +|--------|----------|-----------| +| Conversational | Multi-turn chat | `messages` | +| Instruction | Prompt-completion pairs | `prompt` + `completion` | +| Generic Text | Text completion / pretraining | `text` | +| Preference/DPO | Preference learning | `input` + `preferred_output` + `non_preferred_output` | +| Reasoning | Chain-of-thought training | `messages` with `reasoning` field on assistant | +| Function Calling | Tool use training | `messages` + `tools` | +| VLM | Vision + language | `messages` with image content | + +## Conversational Format + +```json +{ + "messages": [ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "Hello!"}, + {"role": "assistant", "content": "Hi! How can I help?"}, + {"role": "user", "content": "Explain ML", "weight": 0}, + {"role": "assistant", "content": "Machine learning is...", "weight": 1} + ] +} +``` + +- `weight: 0` -- Exclude from loss (masking) +- `weight: 1` -- Include in loss (default for assistant) +- By default, only assistant messages are trained on + +### Preparing a Dataset (Python example) + +```python +from datasets import load_dataset + +coqa_dataset = load_dataset("stanfordnlp/coqa") + +system_prompt = "Read the story and extract answers for the questions.\nStory: {}" + +def map_fields(row): + messages = [{"role": "system", "content": system_prompt.format(row["story"])}] + for q, a in zip(row["questions"], row["answers"]["input_text"]): + messages.append({"role": "user", "content": q}) + messages.append({"role": "assistant", "content": a}) + return {"messages": messages} + +train_messages = coqa_dataset["train"].map( + map_fields, remove_columns=coqa_dataset["train"].column_names +) +train_messages.to_json("coqa_prepared_train.jsonl") +``` + +## Instruction Format + +```json 
+{"prompt": "What is photosynthesis?", "completion": "Photosynthesis is..."} +``` + +- By default, model not trained on prompt text +- Use `train_on_inputs=true` to train on prompts too + +## Generic Text Format + +```json +{"text": "The quick brown fox jumps over the lazy dog."} +``` + +## Preference/DPO Format + +```json +{ + "input": { + "messages": [ + {"role": "user", "content": "What's open-source AI?"} + ] + }, + "preferred_output": [ + {"role": "assistant", "content": "Open-source AI means models are free to use, modify, and share..."} + ], + "non_preferred_output": [ + {"role": "assistant", "content": "It means the code is public."} + ] +} +``` + +Both outputs must contain exactly one message from the assistant role. + +## Reasoning Format + +For fine-tuning reasoning models, assistant messages include a `reasoning` (or `reasoning_content`) +field containing the chain of thought, alongside the `content` field for the final answer: + +```json +{ + "messages": [ + {"role": "user", "content": "What is 15% of 240?"}, + { + "role": "assistant", + "reasoning": "15% means 15/100 = 0.15\n0.15 * 240 = 36", + "content": "15% of 240 is 36." + } + ] +} +``` + +For preference fine-tuning with reasoning, include `reasoning` in both outputs: + +```json +{ + "input": { + "messages": [{"role": "user", "content": "What is 15% of 240?"}] + }, + "preferred_output": [ + { + "role": "assistant", + "reasoning": "15% means 15/100 = 0.15\n0.15 * 240 = 36", + "content": "15% of 240 is 36." + } + ], + "non_preferred_output": [ + { + "role": "assistant", + "reasoning": "15% of 240... about 30 maybe?", + "content": "About 30." + } + ] +} +``` + +Supported models: Qwen3.5 family (0.8B-397B), Qwen3 family (0.6B-235B), Qwen3-Next-80B-A3B-Thinking, GLM-5.1, GLM-5, GLM-4.7, GLM-4.6. 
+ +## Function Calling Format + +```json +{ + "tools": [ + { + "type": "function", + "function": { + "name": "get_weather", + "description": "Get weather for a city", + "parameters": { + "type": "object", + "properties": { + "city": {"type": "string", "description": "City name"} + }, + "required": ["city"] + } + } + } + ], + "messages": [ + {"role": "user", "content": "What's the weather in NYC?"}, + { + "role": "assistant", + "tool_calls": [ + { + "id": "call_1", + "type": "function", + "function": {"name": "get_weather", "arguments": "{\"city\": \"New York\"}"} + } + ] + }, + {"role": "tool", "tool_call_id": "call_1", "content": "{\"temp\": 72, \"condition\": \"sunny\"}"}, + {"role": "assistant", "content": "It's currently 72F and sunny in New York City."} + ] +} +``` + +For preference fine-tuning with function calling, the `tools` field goes inside `input`: + +```json +{ + "input": { + "tools": [...], + "messages": [{"role": "user", "content": "..."}] + }, + "preferred_output": [{"role": "assistant", "tool_calls": [...]}], + "non_preferred_output": [{"role": "assistant", "content": "wrong answer"}] +} +``` + +## VLM Conversational Format + +```json +{ + "messages": [ + {"role": "system", "content": [{"type": "text", "text": "Vision assistant."}]}, + {"role": "user", "content": [ + {"type": "text", "text": "How many oranges?"}, + {"type": "image_url", "image_url": {"url": "data:image/jpeg;base64,/9j/4AAQSkZJRg..."}} + ]}, + {"role": "assistant", "content": [{"type": "text", "text": "There are 7 oranges."}]} + ] +} +``` + +- Images must be base64 encoded with MIME prefix +- Max 10 images per example, 10MB each +- Formats: PNG, JPEG, WEBP +- Only user messages can contain images + +## VLM Instruction Format + +```json +{ + "prompt": [ + {"type": "text", "text": "Describe this image."}, + {"type": "image_url", "image_url": {"url": "data:image/jpeg;base64,..."}} + ], + "completion": [{"type": "text", "text": "The image shows..."}] +} +``` + +## File Formats + +### JSONL 
(Default) +- One JSON object per line +- Automatic sample packing for efficient training +- Max file size: 50GB + +### Parquet (Advanced) +- Pre-tokenized data +- Required columns: `input_ids`, `attention_mask` +- Optional: `labels` (use -100 to mask tokens from loss) +- Useful for custom tokenization or loss masking + +## Loss Masking + +- Conversational format: Use `weight: 0` on specific messages to exclude from loss (only `0` and `1` are accepted on messages; `1` is the default) +- `train_on_inputs` parameter: + - `"auto"` (default): Framework decides based on format + - `true`: Train on everything including user messages/prompts + - `false`: Only train on assistant/completion text +- Parquet format: Set label to -100 for tokens to exclude +- Per-sample loss scaling: Add a top-level `"weight"` to a JSONL sample to multiply the loss for all of its tokens (see [Sample Weights](#sample-weights)) + +## Sample Weights + +All JSONL fine-tuning formats (conversational, instruction, generic text, preference, reasoning, function calling) and all training methods support an optional top-level `"weight"` key on each JSON object. The value is a non-negative float that acts as a loss multiplier on every token in that sample, letting you up- or down-weight individual examples without changing the dataset itself. + +- Top-level `weight` is a non-negative float (e.g. `0.1`, `1.0`, `2.5`); `1.0` is the implicit default if omitted +- Distinct from the per-message `weight` field in conversational data, which only accepts `0` or `1` and gates whether a message's tokens enter the loss at all +- Sample weights and message weights can be combined in the same file +- Setting a sample's top-level `weight` to `0` effectively drops it from the loss while still keeping it in the dataset (e.g. 
for packing statistics) + +```json +{ + "messages": [ + {"role": "system", "content": "This is a system prompt."}, + {"role": "user", "content": "Hello, how are you?"}, + {"role": "assistant", "content": "I'm doing well, thank you! How can I help you?"}, + {"role": "user", "content": "Can you explain machine learning?", "weight": 0}, + {"role": "assistant", "content": "Machine learning is...", "weight": 1} + ], + "weight": 0.9 +} +{ + "messages": [ + {"role": "user", "content": "Can you explain why?"}, + {"role": "assistant", "content": "I can't."} + ], + "weight": 0.1 +} +``` + +## Data Validation + +Validation runs in two stages: + +1. Client-side structural check (local). Runs by default inside `client.files.upload(..., check=True)` or with `together files check`. Verifies only basic formatting: UTF-8 encoding, one JSON object per line, minimum sample count, and maximum file size. Pass `check=False` to skip (useful for very large files). +2. Server-side schema validation (during ingestion). Runs after upload and performs the full fine-tuning schema check (conversation roles, tool calls, required fields, etc.). The file is only usable for fine-tuning once `processing_status` becomes `COMPLETED`. If validation rejects the dataset, `processing_status` becomes `INVALID_FORMAT` and `validation_report.error` carries a user-facing reason. + +```python +import time +from together import Together + +client = Together() + +# 1. Upload with the local structural check enabled (default). +file = client.files.upload(file="my_data.jsonl", purpose="fine-tune", check=True) +print(file.id) # file-abc123 + +# 2. Poll until server-side validation finishes before creating a fine-tuning job. +while True: + meta = client.files.retrieve(file.id) + if meta.processing_status == "COMPLETED": + break + if meta.processing_status == "INVALID_FORMAT": + # meta.validation_report["error"] carries a user-facing reason. 
+ raise ValueError( + f"file is not suitable for fine-tuning: {meta.validation_report}" + ) + if meta.processing_status == "FAILED": + raise RuntimeError( + f"file processing did not complete: {meta.validation_report}" + ) + time.sleep(5) +``` + +Treat `processing_status` as the authoritative readiness signal; the `validation_report` schema may evolve. A successful response looks like: + +```json +{ + "processing_status": "COMPLETED", + "validation_report": {"valid": true, "dataset_format": "conversation", "nlines": 7199} +} +``` + +A user-correctable failure looks like: + +```json +{ + "processing_status": "INVALID_FORMAT", + "validation_report": { + "valid": false, + "error_type": "INVALID_FORMAT", + "error": "Line 7: `messages[1]` must contain a `role` field" + } +} +``` + +```shell +# CLI: check format and upload +together files check my_data.jsonl +together files upload my_data.jsonl + +# Upload without the local structural check +together files upload my_data.jsonl --no-check + +# Inspect server-side validation status (processing_status / validation_report) +together files retrieve <FILE_ID> + +# List and download files +together files list +together files retrieve-content <FILE_ID> +``` + +## Converting Image URLs to Base64 + +```python +import base64 +import requests + +def url_to_base64(url: str, mime_type: str = "image/jpeg") -> str: + response = requests.get(url) + encoded = base64.b64encode(response.content).decode("utf-8") + return f"data:{mime_type};base64,{encoded}" +``` diff --git a/plugins/togetherai/skills/together-fine-tuning/references/deployment.md b/plugins/togetherai/skills/together-fine-tuning/references/deployment.md new file mode 100644 index 00000000..0b30ff62 --- /dev/null +++ b/plugins/togetherai/skills/together-fine-tuning/references/deployment.md @@ -0,0 +1,234 @@ +# Fine-tuned Model Deployment Reference +## Contents + +- [Deployment Options](#deployment-options) +- [Training Parameters](#training-parameters) +- [Job Monitoring](#job-monitoring) +- 
[Continued Fine-tuning](#continued-fine-tuning) +- [Pricing](#pricing) + + +## Deployment Options + +### Option 1: Dedicated Endpoint + +Deploy your fine-tuned model on a dedicated endpoint for production use. + +```python +endpoint = client.endpoints.create( + display_name="Fine-tuned Model", + model="your-username/Model-Name-your-suffix", + hardware="4x_nvidia_h100_80gb_sxm", + autoscaling={"min_replicas": 1, "max_replicas": 1}, +) +print(f"Endpoint ID: {endpoint.id}") + +# Wait for the endpoint to be ready +import time +while True: + ep = client.endpoints.retrieve(endpoint.id) + print(f" State: {ep.state}") + if ep.state == "STARTED": + break + if ep.state in ("FAILED", "STOPPED"): + raise SystemExit(1) + time.sleep(15) + +# Query via endpoint.name (not the model ID) +response = client.chat.completions.create( + model=endpoint.name, + messages=[{"role": "user", "content": "Hello!"}], + max_tokens=128, +) +print(response.choices[0].message.content) + +# Delete the endpoint when done to stop charges +client.endpoints.delete(endpoint.id) +``` + +- Per-minute hosting charges while running +- Guaranteed capacity and latency +- No rate limits, high max load +- Supports both LoRA and Full fine-tuned models + +### Option 2: Download Weights + +Download and run locally or on your infrastructure. 
+ +```python +client.fine_tuning.download( + id="ft-abc123", + output="my-model/model.tar.zst", +) +``` + +```shell +# CLI: download model weights +together fine-tuning download ft-abc123 + +# Download to a specific directory +together fine-tuning download ft-abc123 --output_dir ./my-model + +# Download a specific checkpoint step +together fine-tuning download ft-abc123 --checkpoint-step 48 + +# Download merged or adapter-only weights (LoRA jobs) +together fine-tuning download ft-abc123 --checkpoint-type merged +together fine-tuning download ft-abc123 --checkpoint-type adapter +``` + +```shell +# Extract the downloaded archive +tar -xf model-name.tar.zst +``` + +Options: +- `--output_dir`, `-o` -- Specify the output directory +- `--checkpoint-step`, `-s` -- Download a specific checkpoint's weights (default: latest) +- `--checkpoint-type` -- `default`, `merged`, or `adapter` (merged/adapter only for LoRA jobs) + +## Training Parameters + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `model` | string | Required | Base model | +| `training_file` | string | Required | File ID from upload | +| `validation_file` | string | - | Optional validation file | +| `suffix` | string | - | Custom model name suffix | +| `n_epochs` | int | 1-3 | Training epochs | +| `n_checkpoints` | int | 1 | Checkpoints to save | +| `batch_size` | int/str | `"max"` | Batch size (or "max" for auto) | +| `learning_rate` | float | ~1e-5 | Learning rate | +| `warmup_ratio` | float | 0 | Warmup step ratio | +| `lora` | bool | true | Use LoRA method | +| `lora_r` | int | 64 | LoRA rank. Per-model max -- 64 for most models; 16 for Moonshot Kimi-K2 family, Z.ai GLM-5/5.1, and DeepSeek R1/V3 (non-distill) families. See [supported-models.md](supported-models.md). 
| +| `lora_alpha` | int | 16 | LoRA scaling factor | +| `train_on_inputs` | bool/str | "auto" | Train on prompts/user msgs | +| `n_evals` | int | 0 | Validation evaluations (>0 to use validation set) | +| `wandb_api_key` | string | - | W&B integration | +| `from_checkpoint` | string | - | Continue from previous job ID | + +### DPO-specific Parameters + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `training_method` | string | "sft" | Set to `"dpo"` for preference tuning | +| `dpo_beta` | float | 0.1 | Deviation control (0.05-0.9) | + +### VLM-specific Parameters + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `train_vision` | bool | false | Update vision encoder weights | + +### BYOM-specific Parameters + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `from_hf_model` | string | - | HuggingFace model ID | +| `hf_api_token` | string | - | HuggingFace token (for private repos) | + +## Job Monitoring + +### Status Flow +`Pending` -> `Queued` -> `Running` -> `Uploading` -> `Completed` + +### Python SDK + +```python +from together import Together + +client = Together() + +# Get status +status = client.fine_tuning.retrieve(job_id) +print(status.status) + +# List events +events = client.fine_tuning.list_events(id=job_id) +for event in events.data: + print(event.message) + +# List checkpoints +checkpoints = client.fine_tuning.list_checkpoints(id=job_id) +for cp in checkpoints: + print(f"Step {cp.step}: {cp.metrics}") + +# List per-step training metrics (loss, learning rate, grad norm, eval/loss, ...) 
+metrics = client.fine_tuning.list_metrics(job_id) +for step in metrics.metrics: + print(step) +``` + +`list_metrics` accepts optional filters for trimming long runs: + +| Parameter | Type | Description | +|-----------|------|-------------| +| `global_step_from` | int | Return only metrics with `global_step` >= this value | +| `global_step_to` | int | Return only metrics with `global_step` <= this value | +| `logged_at_from` | str or datetime | Return only metrics logged at or after this ISO 8601 timestamp | +| `logged_at_to` | str or datetime | Return only metrics logged at or before this ISO 8601 timestamp | +| `resolution` | int | Cap the response at this many uniformly sampled training points (eval metrics are always returned in full) | + +Each entry is either a training step (`train/global_step`, `train/loss`, `train/learning_rate`, `train/grad_norm`, ...) or an eval step (`eval/loss`, ...). When both occur at the same step, two separate objects are returned. + +### CLI + +```shell +together fine-tuning retrieve <JOB_ID> +together fine-tuning list-events <JOB_ID> +together fine-tuning list-checkpoints <JOB_ID> +together fine-tuning list-metrics <JOB_ID> # ASCII charts (default) +together fine-tuning list-metrics <JOB_ID> --json # raw JSON output +together fine-tuning list +together fine-tuning cancel <JOB_ID> +together fine-tuning delete <JOB_ID> +``` + +`list-metrics` also accepts `--global-step-from`, `--global-step-to`, `--logged-at-from`, `--logged-at-to`, and `--resolution` for the same filtering behavior as the Python SDK. 
+ +### cURL + +```shell +# Retrieve job details +curl "https://api.together.xyz/v1/fine-tunes/ft-abc123" \ + -H "Authorization: Bearer $TOGETHER_API_KEY" + +# List events +curl "https://api.together.xyz/v1/fine-tunes/ft-abc123/events" \ + -H "Authorization: Bearer $TOGETHER_API_KEY" + +# List checkpoints +curl "https://api.together.xyz/v1/fine-tunes/ft-abc123/checkpoints" \ + -H "Authorization: Bearer $TOGETHER_API_KEY" + +# Cancel job +curl -X POST "https://api.together.xyz/v1/fine-tunes/ft-abc123/cancel" \ + -H "Authorization: Bearer $TOGETHER_API_KEY" + +# Delete job +curl -X DELETE "https://api.together.xyz/v1/fine-tunes/ft-abc123" \ + -H "Authorization: Bearer $TOGETHER_API_KEY" +``` + +## Continued Fine-tuning + +Resume from a previous job's checkpoint: + +```python +response = client.fine_tuning.create( + training_file=new_file_id, + model="Qwen/Qwen3-8B", + from_checkpoint=previous_job_id, +) +``` + +## Pricing + +- Based on total tokens processed: `total_tokens x per_token_rate` +- `total_tokens = (n_epochs x training_tokens) + (n_evals x validation_tokens)` +- Cost varies by model size, method (LoRA vs Full), and type (SFT vs DPO) +- No minimum price -- pay only for tokens processed +- Exact token count and price available after tokenization via dashboard or + `together fine-tuning retrieve $JOB_ID` +- Dedicated endpoint hosting charges are separate (per-minute while running) diff --git a/plugins/togetherai/skills/together-fine-tuning/references/supported-models.md b/plugins/togetherai/skills/together-fine-tuning/references/supported-models.md new file mode 100644 index 00000000..921fadac --- /dev/null +++ b/plugins/togetherai/skills/together-fine-tuning/references/supported-models.md @@ -0,0 +1,185 @@ +# Fine-tuning Supported Models +## Contents + +- [Recommended Starting Models](#recommended-starting-models) +- [LoRA Fine-tuning Models](#lora-fine-tuning-models) +- [Full Fine-tuning](#full-fine-tuning) +- [VLM Fine-tuning](#vlm-fine-tuning) +- 
[Reasoning Fine-tuning](#reasoning-fine-tuning) +- [DPO/Preference Training](#dpopreference-training) +- [BYOM (Bring Your Own Model)](#byom) + + +## Recommended Starting Models + +| Task | Model | API String | +|------|-------|-----------| +| Simple tasks | Qwen3 8B | `Qwen/Qwen3-8B` | +| Complex domains | Qwen3 32B | `Qwen/Qwen3-32B` | +| General (English) | Llama 3.1 8B | `meta-llama/Meta-Llama-3.1-8B-Instruct-Reference` | +| Reasoning | Qwen3 8B+ | `Qwen/Qwen3-8B` (or larger) | +| Vision | Qwen3-VL-8B | `Qwen/Qwen3-VL-8B-Instruct` | + +## LoRA Fine-tuning Models + +### Large Models (MoE / 70B+) + +| Organization | Model | API String | Context (SFT) | +|-------------|-------|-----------|---------------| +| Qwen | Qwen3.5 397B A17B | `Qwen/Qwen3.5-397B-A17B` | 32K | +| Qwen | Qwen3.5 122B A10B | `Qwen/Qwen3.5-122B-A10B` | 65K | +| Moonshot | Kimi K2.5 | `moonshotai/Kimi-K2.5` | 32K | +| Moonshot | Kimi K2 Thinking | `moonshotai/Kimi-K2-Thinking` | 32K | +| Moonshot | Kimi K2 Instruct 0905 | `moonshotai/Kimi-K2-Instruct-0905` | 32K | +| Moonshot | Kimi K2 Base | `moonshotai/Kimi-K2-Base` | 32K | +| Z.ai | GLM-5.1 | `zai-org/GLM-5.1` | 50K | +| Z.ai | GLM-5 | `zai-org/GLM-5` | 50K | +| Z.ai | GLM-4.7 | `zai-org/GLM-4.7` | 128K | +| Z.ai | GLM-4.6 | `zai-org/GLM-4.6` | 128K | +| OpenAI | GPT-OSS 120B | `openai/gpt-oss-120b` | 16K | +| OpenAI | GPT-OSS 20B | `openai/gpt-oss-20b` | 24K | +| DeepSeek | DeepSeek-R1-0528 | `deepseek-ai/DeepSeek-R1-0528` | 131K | +| DeepSeek | DeepSeek-R1 | `deepseek-ai/DeepSeek-R1` | 131K | +| DeepSeek | DeepSeek-V3.1 | `deepseek-ai/DeepSeek-V3.1` | 131K | +| DeepSeek | DeepSeek-V3-0324 | `deepseek-ai/DeepSeek-V3-0324` | 131K | +| DeepSeek | DeepSeek-V3 | `deepseek-ai/DeepSeek-V3` | 131K | +| Qwen | Qwen3 235B A22B | `Qwen/Qwen3-235B-A22B` | 41K | +| Qwen | Qwen3 235B Instruct | `Qwen/Qwen3-235B-A22B-Instruct-2507` | 49K | +| Qwen | Qwen3-Coder 480B | `Qwen/Qwen3-Coder-480B-A35B-Instruct` | 262K | +| Qwen | Qwen3-Coder 30B A3B | 
`Qwen/Qwen3-Coder-30B-A3B-Instruct` | 262K | +| Meta | Llama 4 Maverick | `meta-llama/Llama-4-Maverick-17B-128E-Instruct` | 16K | +| Meta | Llama 4 Scout | `meta-llama/Llama-4-Scout-17B-16E-Instruct` | 65K | +| Meta | Llama 3.3 70B | `meta-llama/Llama-3.3-70B-Instruct-Reference` | 24K | +| Meta | Llama 3.1 70B | `meta-llama/Meta-Llama-3.1-70B-Instruct-Reference` | 24K | +| DeepSeek | R1 Distill Llama 70B | `deepseek-ai/DeepSeek-R1-Distill-Llama-70B` | 24K | +| Qwen | Qwen2.5 72B | `Qwen/Qwen2.5-72B-Instruct` | 24K | + +### Medium Models (7B-32B) + +| Organization | Model | API String | Context (SFT) | +|-------------|-------|-----------|---------------| +| Qwen | Qwen3.5 27B | `Qwen/Qwen3.5-27B` | 32K | +| Qwen | Qwen3.5 9B | `Qwen/Qwen3.5-9B` | 65K | +| Qwen | Qwen3.5 35B A3B | `Qwen/Qwen3.5-35B-A3B` | 65K | +| Qwen | Qwen3.6 35B A3B | `Qwen/Qwen3.6-35B-A3B` | 65K | +| Qwen | Qwen3 32B | `Qwen/Qwen3-32B` | 41K | +| Qwen | Qwen3 14B | `Qwen/Qwen3-14B` | 41K | +| Qwen | Qwen3 8B | `Qwen/Qwen3-8B` | 41K | +| Qwen | Qwen3-Next 80B A3B | `Qwen/Qwen3-Next-80B-A3B-Instruct` | 16K | +| Qwen | Qwen3 30B A3B | `Qwen/Qwen3-30B-A3B` | 8K | +| Qwen | Qwen2.5 32B Instruct | `Qwen/Qwen2.5-32B-Instruct` | 32K | +| Qwen | Qwen2.5 14B Instruct | `Qwen/Qwen2.5-14B-Instruct` | 32K | +| Qwen | Qwen2.5 7B Instruct | `Qwen/Qwen2.5-7B-Instruct` | 32K | +| Meta | Llama 3.1 8B | `meta-llama/Meta-Llama-3.1-8B-Instruct-Reference` | 131K | +| DeepSeek | R1 Distill Qwen 14B | `deepseek-ai/DeepSeek-R1-Distill-Qwen-14B` | 65K | +| NVIDIA | Nemotron Nano 9B v2 | `nvidia/NVIDIA-Nemotron-Nano-9B-v2` | 32K | +| Google | Gemma 4 31B IT | `google/gemma-4-31B-it` | 49K | +| Google | Gemma 4 26B A4B IT | `google/gemma-4-26B-A4B-it` | 49K | +| Google | Gemma 3 27B | `google/gemma-3-27b-it` | 49K | +| Google | Gemma 3 12B | `google/gemma-3-12b-it` | 65K | +| Mistral | Mixtral 8x7B | `mistralai/Mixtral-8x7B-Instruct-v0.1` | 32K | +| Mistral | Mistral 7B v0.2 | `mistralai/Mistral-7B-Instruct-v0.2` | 32K | + 
+### Small Models (<7B) + +| Organization | Model | API String | Context (SFT) | +|-------------|-------|-----------|---------------| +| Qwen | Qwen3.5 4B | `Qwen/Qwen3.5-4B` | 131K | +| Qwen | Qwen3.5 2B | `Qwen/Qwen3.5-2B` | 131K | +| Qwen | Qwen3.5 0.8B | `Qwen/Qwen3.5-0.8B` | 131K | +| Qwen | Qwen3 4B | `Qwen/Qwen3-4B` | 41K | +| Qwen | Qwen3 1.7B | `Qwen/Qwen3-1.7B` | 41K | +| Qwen | Qwen3 0.6B | `Qwen/Qwen3-0.6B` | 41K | +| Meta | Llama 3.2 3B | `meta-llama/Llama-3.2-3B-Instruct` | 131K | +| Meta | Llama 3.2 1B | `meta-llama/Llama-3.2-1B-Instruct` | 131K | +| Google | Gemma 3 4B | `google/gemma-3-4b-it` | 131K | +| Google | Gemma 3 1B | `google/gemma-3-1b-it` | 32K | +| Google | Gemma 3 270M | `google/gemma-3-270m-it` | 32K | +| Qwen | Qwen2.5 3B | `Qwen/Qwen2.5-3B-Instruct` | 32K | +| Qwen | Qwen2.5 1.5B | `Qwen/Qwen2.5-1.5B-Instruct` | 32K | +| DeepSeek | R1 Distill Qwen 1.5B | `deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B` | 131K | + +### Long-context LoRA (32K-131K) + +| Model | API String | Max Context | +|-------|-----------|-------------| +| DeepSeek R1 | `deepseek-ai/DeepSeek-R1` | 131K | +| Llama 3.3 70B 131K | `meta-llama/Llama-3.3-70B-131k-Instruct-Reference` | 131K | +| Llama 3.1 8B 131K | `meta-llama/Meta-Llama-3.1-8B-131k-Instruct-Reference` | 131K | +| Llama 3.1 70B 131K | `meta-llama/Meta-Llama-3.1-70B-131k-Instruct-Reference` | 131K | + +Note: Long-context fine-tuning of Llama 3.1 models (32K-131K) is only supported using LoRA. + +### Max LoRA rank caps + +The `lora_r` parameter defaults to 64 but is capped per model. Setting `lora_r` above the cap returns an error. 
+ +| Cap | Models | +|-----|--------| +| 16 | Moonshot Kimi K2 family (`Kimi-K2.5`, `Kimi-K2-Thinking`, `Kimi-K2-Instruct-0905`, `Kimi-K2-Instruct`, `Kimi-K2-Base`) | +| 16 | Z.ai GLM-5, GLM-5.1 | +| 16 | DeepSeek R1 / R1-0528 / V3 / V3.1 / V3-0324 (and `-Base` variants); R1-Distill variants stay at 64 | +| 64 | All other LoRA-supported models (default) | + +## Full Fine-tuning + +Same models as LoRA, but batch sizes are generally smaller. Key full-fine-tuning-only models: + +| Organization | Model | API String | Context (SFT) | +|-------------|-------|-----------|---------------| +| DeepSeek | R1 Distill Llama 70B | `deepseek-ai/DeepSeek-R1-Distill-Llama-70B` | 24K | +| DeepSeek | R1 Distill Qwen 14B | `deepseek-ai/DeepSeek-R1-Distill-Qwen-14B` | 65K | +| DeepSeek | R1 Distill Qwen 1.5B | `deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B` | 131K | +| Qwen | All Qwen3 variants | Various | 32K-41K | +| Google | All Gemma 3 variants | Various | 32K-131K | +| Meta | Llama 3.x variants | Various | 8K-131K | + +## VLM Fine-tuning + +| Model | API String | Full | LoRA | +|-------|-----------|------|------| +| Qwen3-VL-8B | `Qwen/Qwen3-VL-8B-Instruct` | Yes | Yes | +| Qwen3-VL-30B-A3B | `Qwen/Qwen3-VL-30B-A3B-Instruct` | Yes | Yes | +| Qwen3-VL-235B | `Qwen/Qwen3-VL-235B-A22B-Instruct` | No | Yes | +| Llama 4 Maverick VLM | `meta-llama/Llama-4-Maverick-17B-128E-Instruct-VLM` | No | Yes | +| Llama 4 Scout VLM | `meta-llama/Llama-4-Scout-17B-16E-Instruct-VLM` | No | Yes | +| Gemma 3 4B VLM | `google/gemma-3-4b-it-VLM` | Yes | Yes | +| Gemma 3 12B VLM | `google/gemma-3-12b-it-VLM` | Yes | Yes | +| Gemma 3 27B VLM | `google/gemma-3-27b-it-VLM` | Yes | Yes | + +## Reasoning Fine-tuning + +| Organization | Model | API String | +|-------------|-------|-----------| +| Qwen | Qwen3.5 family | `Qwen/Qwen3.5-*` (0.8B, 2B, 4B, 9B, 27B, 35B-A3B, 122B-A10B, 397B-A17B) | +| Qwen | Qwen3 0.6B - 235B | `Qwen/Qwen3-*` (all sizes and base variants) | +| Qwen | Qwen3 30B A3B | `Qwen/Qwen3-30B-A3B` 
(and base) | +| Qwen | Qwen3-Next 80B Thinking | `Qwen/Qwen3-Next-80B-A3B-Thinking` | +| Z.ai | GLM 5.1 | `zai-org/GLM-5.1` | +| Z.ai | GLM 5 | `zai-org/GLM-5` | +| Z.ai | GLM 4.7 | `zai-org/GLM-4.7` | +| Z.ai | GLM 4.6 | `zai-org/GLM-4.6` | + +## DPO/Preference Training + +Same models as LoRA/Full fine-tuning. Additional parameters: +- `training_method`: `"dpo"` +- `dpo_beta`: 0.05-0.9 (default 0.1) +- DPO context lengths are generally half of SFT context lengths + +## BYOM (Bring Your Own Model) + +Fine-tune any CausalLM model from HuggingFace Hub: + +```python +job = client.fine_tuning.create( + model="Qwen/Qwen3-4B", # Base template (infrastructure config) + from_hf_model="my-org/my-custom-model", # Your actual model + training_file=file_id, + hf_api_token="hf_xxx", # Optional, for private repos +) +``` + +Important: The `model` parameter (base template) should have a similar architecture, size, and +sequence length to the `from_hf_model` for best results. + diff --git a/plugins/togetherai/skills/together-fine-tuning/scripts/dpo_workflow.py b/plugins/togetherai/skills/together-fine-tuning/scripts/dpo_workflow.py new file mode 100644 index 00000000..36662494 --- /dev/null +++ b/plugins/togetherai/skills/together-fine-tuning/scripts/dpo_workflow.py @@ -0,0 +1,267 @@ +#!/usr/bin/env python3 +""" +Together AI Fine-Tuning -- DPO Preference Workflow (v2 SDK) + +End-to-end: SFT first, then DPO preference tuning from checkpoint. +Best practice is to run SFT before DPO for better model quality. 
def wait_for_file_ready(file_id: str, poll_interval: int = 5) -> None:
    """Poll the uploaded file until its server-side processing reaches a terminal status.

    Args:
        file_id: Identifier of the uploaded file to check.
        poll_interval: Seconds to sleep between two consecutive checks.

    Raises:
        ValueError: The file's processing status is ``INVALID_FORMAT``.
        RuntimeError: The file's processing status is ``FAILED``.
    """
    while True:
        file_meta = client.files.retrieve(file_id)
        state = file_meta.processing_status

        if state == "COMPLETED":
            break
        if state == "INVALID_FORMAT":
            detail = file_meta.validation_report
            raise ValueError(f"file {file_id} is not suitable for fine-tuning: {detail}")
        if state == "FAILED":
            detail = file_meta.validation_report
            raise RuntimeError(f"file {file_id} processing did not complete: {detail}")

        # Any other status is treated as "still processing": wait and ask again.
        time.sleep(poll_interval)
+ ), + }, + ] + }, + ] + + +def sample_dpo_data() -> list[dict]: + """Return a small preference dataset for DPO training.""" + return [ + { + "input": {"messages": [{"role": "user", "content": "Explain quantum computing simply."}]}, + "preferred_output": [ + { + "role": "assistant", + "content": ( + "Quantum computing uses quantum bits (qubits) that can be in " + "superposition -- representing both 0 and 1 at once. This enables " + "solving certain problems exponentially faster than classical computers." + ), + } + ], + "non_preferred_output": [ + { + "role": "assistant", + "content": "It's just faster computers that use quantum stuff.", + } + ], + }, + { + "input": {"messages": [{"role": "user", "content": "What is machine learning?"}]}, + "preferred_output": [ + { + "role": "assistant", + "content": ( + "Machine learning is a branch of AI where systems learn patterns from " + "data rather than following explicit rules. Models are trained on examples " + "and progressively improve their accuracy on new, unseen data." 
# Fine-tuning job statuses after which the job will never reach "completed".
# "failed" is kept for backward compatibility; "error" and "user_error" are the
# failure values listed in the Together fine-tuning API reference. Without them
# a failed job would be polled forever.
_JOB_FAILURE_STATUSES = ("failed", "cancelled", "error", "user_error")

# Endpoint states from which the endpoint will not become "STARTED" by itself.
# "ERROR" is the failure state listed in the Together endpoints API reference.
_ENDPOINT_FAILURE_STATES = ("FAILED", "STOPPED", "ERROR")


def _wait_for_job(job_id: str, label: str, poll_interval: int):
    """Poll a fine-tuning job until it completes and return its final record.

    Args:
        job_id: Identifier of the fine-tuning job to poll.
        label: Short stage name ("SFT" or "DPO") used in progress messages.
        poll_interval: Seconds to sleep between status checks.

    Raises:
        SystemExit: With exit code 1 when the job ends in a failure status.
    """
    while True:
        job = client.fine_tuning.retrieve(id=job_id)
        print(f" {label} status: {job.status}")
        if job.status == "completed":
            print(f" {label} output: {job.x_model_output_name}")
            return job
        if job.status in _JOB_FAILURE_STATUSES:
            print(f"{label} failed: {job.status}")
            raise SystemExit(1)
        time.sleep(poll_interval)


def _wait_for_endpoint(endpoint_id: str, poll_interval: int) -> None:
    """Poll a dedicated endpoint until it reports the ``STARTED`` state.

    Raises:
        SystemExit: With exit code 1 when the endpoint reaches a failure state.
    """
    while True:
        ep = client.endpoints.retrieve(endpoint_id)
        print(f" Endpoint state: {ep.state}")
        if ep.state == "STARTED":
            return
        if ep.state in _ENDPOINT_FAILURE_STATES:
            print(f"Endpoint {ep.state}")
            raise SystemExit(1)
        time.sleep(poll_interval)


def main() -> None:
    """Run the SFT -> DPO -> deploy -> test workflow from the command line.

    Uploads the SFT and DPO datasets (bundled samples are used when no path is
    given), trains a LoRA SFT job, continues with a DPO job from that
    checkpoint, deploys the result to a dedicated endpoint and sends one test
    prompt to it.

    Raises:
        SystemExit: With exit code 1 when a training job or the endpoint ends
            in a failure state.
    """
    args = parse_args()
    sft_path: Path | None = None
    dpo_path: Path | None = None
    sft_upload_path = args.sft_training_file
    dpo_upload_path = args.dpo_training_file

    # --- 1. Prepare and upload both files ---
    # Temp datasets are created inside the try so that a failure while building
    # the second one still removes the first.
    try:
        if sft_upload_path is None:
            sft_path = create_temp_dataset(sample_sft_data())
            sft_upload_path = str(sft_path)
        if dpo_upload_path is None:
            dpo_path = create_temp_dataset(sample_dpo_data())
            dpo_upload_path = str(dpo_path)

        sft_file = client.files.upload(file=sft_upload_path, purpose="fine-tune", check=True)
        dpo_file = client.files.upload(file=dpo_upload_path, purpose="fine-tune", check=True)
    finally:
        if sft_path is not None:
            sft_path.unlink(missing_ok=True)
        if dpo_path is not None:
            dpo_path.unlink(missing_ok=True)
    print(f"SFT file: {sft_file.id}")
    print(f"DPO file: {dpo_file.id}")

    # Wait for server-side validation on both files before starting training.
    print("Waiting for server-side validation...")
    wait_for_file_ready(sft_file.id)
    wait_for_file_ready(dpo_file.id)
    print("Files ready for fine-tuning.")

    # --- 2. Step 1: Run SFT job first ---
    print("\n--- Step 1: SFT Training ---")
    sft_job = client.fine_tuning.create(
        training_file=sft_file.id,
        model=args.model,
        lora=True,
        n_epochs=args.sft_epochs,
        learning_rate=args.learning_rate,
        suffix=args.sft_suffix,
    )
    print(f"SFT job: {sft_job.id}")
    _wait_for_job(sft_job.id, "SFT", args.poll_interval)

    # --- 3. Step 2: Run DPO from SFT checkpoint ---
    print("\n--- Step 2: DPO Training (from SFT checkpoint) ---")
    dpo_job = client.fine_tuning.create(
        training_file=dpo_file.id,
        from_checkpoint=sft_job.id,
        model=args.model,
        training_method="dpo",
        dpo_beta=args.dpo_beta,
        lora=True,
        n_epochs=args.dpo_epochs,
        suffix=args.dpo_suffix,
    )
    print(f"DPO job: {dpo_job.id}")
    dpo_result = _wait_for_job(dpo_job.id, "DPO", args.poll_interval)

    # --- 4. Deploy and test the DPO-tuned model ---
    print("\n--- Deploying DPO-tuned model ---")
    endpoint = client.endpoints.create(
        display_name="DPO Fine-tuned Model",
        model=dpo_result.x_model_output_name,
        hardware="4x_nvidia_h100_80gb_sxm",
        autoscaling={"min_replicas": 1, "max_replicas": 1},
    )
    print(f"Created endpoint: {endpoint.id}")
    _wait_for_endpoint(endpoint.id, args.poll_interval)

    print("\n--- Testing DPO-tuned model ---")
    response = client.chat.completions.create(
        model=endpoint.name,
        messages=[{"role": "user", "content": args.test_prompt}],
        max_tokens=256,
    )
    print(f"Response: {response.choices[0].message.content}")
    print("\nEndpoint is running. Delete it when done to avoid charges:")
    print(f' client.endpoints.delete("{endpoint.id}")')
+ """ + while True: + meta = client.files.retrieve(file_id) + if meta.processing_status == "COMPLETED": + return + if meta.processing_status == "INVALID_FORMAT": + raise ValueError( + f"file {file_id} is not suitable for fine-tuning: {meta.validation_report}" + ) + if meta.processing_status == "FAILED": + raise RuntimeError( + f"file {file_id} processing did not complete: {meta.validation_report}" + ) + time.sleep(poll_interval) + + +def sample_training_data() -> list[dict]: + """Return a small conversational dataset for demonstration.""" + return [ + { + "messages": [ + {"role": "system", "content": "You are a helpful customer support agent."}, + {"role": "user", "content": "How do I reset my password?"}, + { + "role": "assistant", + "content": ( + "Go to Settings > Security > Reset Password. " + "You'll receive a confirmation email within 5 minutes." + ), + }, + ] + }, + { + "messages": [ + {"role": "system", "content": "You are a helpful customer support agent."}, + {"role": "user", "content": "What are your business hours?"}, + { + "role": "assistant", + "content": ( + "We're available Monday-Friday, 9 AM - 6 PM EST. " + "Weekend support is available via email." + ), + }, + ] + }, + { + "messages": [ + {"role": "system", "content": "You are a helpful customer support agent."}, + {"role": "user", "content": "How do I cancel my subscription?"}, + { + "role": "assistant", + "content": ( + "Go to Settings > Billing > Cancel Subscription. " + "Your access continues until the end of the billing period." 
def main() -> None:
    """Run the upload -> LoRA fine-tune -> (optional) deploy workflow.

    Refuses to do anything unless ``--i-understand-costs`` was passed. Uses the
    bundled sample dataset when ``--training-file`` is omitted. Deployment and
    the test query only happen with ``--deploy``.

    Raises:
        SystemExit: When the cost acknowledgement is missing, or with exit
            code 1 when the job or the endpoint ends in a failure state.
    """
    # "failed" is kept for backward compatibility; "error" / "user_error" are
    # the failure statuses listed in the Together fine-tuning API reference.
    # Without them a failed job would be polled forever.
    job_failure_statuses = ("failed", "cancelled", "error", "user_error")
    # "ERROR" is the endpoint failure state listed in the endpoints reference.
    endpoint_failure_states = ("FAILED", "STOPPED", "ERROR")

    args = parse_args()
    if not args.i_understand_costs:
        raise SystemExit(
            "This script uploads data and starts a paid fine-tuning job. "
            "Pass --i-understand-costs after reviewing the target model, data, and expected spend."
        )

    # --- 1. Pick the dataset (user file or bundled sample) ---
    data_path: Path | None = None
    upload_path = args.training_file
    if upload_path is None:
        training_data = sample_training_data()
        data_path = create_temp_dataset(training_data)
        upload_path = str(data_path)
        print(f"Wrote {len(training_data)} examples to {data_path}")

    # --- 2. Upload training file with validation enabled ---
    try:
        file_response = client.files.upload(file=upload_path, purpose="fine-tune", check=True)
    finally:
        # The temporary sample file is only needed for the upload itself.
        if data_path is not None:
            data_path.unlink(missing_ok=True)
    file_id = file_response.id
    print(f"Uploaded file: {file_id}")

    # Wait for server-side validation before spending tokens on training.
    print("Waiting for server-side validation...")
    wait_for_file_ready(file_id)
    print("File ready for fine-tuning.")

    # --- 3. Create LoRA fine-tuning job ---
    job = client.fine_tuning.create(
        training_file=file_id,
        model=args.model,
        n_epochs=args.n_epochs,
        learning_rate=args.learning_rate,
        lora=True,
        suffix=args.suffix,
    )
    print(f"Created fine-tuning job: {job.id}")

    # --- 4. Monitor training ---
    while True:
        status = client.fine_tuning.retrieve(id=job.id)
        print(f" Status: {status.status}")

        if status.status == "completed":
            print("\nTraining complete!")
            print(f" Output model: {status.x_model_output_name}")
            break
        if status.status in job_failure_statuses:
            print(f"Job ended: {status.status}")
            raise SystemExit(1)

        time.sleep(args.poll_interval)

    # --- 5. List training events ---
    events = client.fine_tuning.list_events(id=job.id)
    for event in events.data:
        print(f" [{event.created_at}] {event.message}")

    if not args.deploy:
        print("\nTraining complete. Deployment skipped; pass --deploy to create a dedicated endpoint.")
        return

    # --- 6. Deploy as a Dedicated Endpoint ---
    output_model = status.x_model_output_name
    endpoint = client.endpoints.create(
        display_name=args.display_name,
        model=output_model,
        hardware=args.hardware,
        autoscaling={"min_replicas": 1, "max_replicas": 1},
    )
    print(f"\nCreated endpoint: {endpoint.id}")

    # Wait for the endpoint to be ready before querying
    while True:
        ep = client.endpoints.retrieve(endpoint.id)
        print(f" Endpoint state: {ep.state}")
        if ep.state == "STARTED":
            break
        if ep.state in endpoint_failure_states:
            print(f"Endpoint {ep.state}")
            raise SystemExit(1)
        time.sleep(args.poll_interval)

    # --- 7. Query the fine-tuned model via the endpoint name ---
    response = client.chat.completions.create(
        model=endpoint.name,
        messages=[
            {"role": "system", "content": "You are a helpful customer support agent."},
            {"role": "user", "content": args.test_prompt},
        ],
        max_tokens=256,
    )
    print(f"\nFine-tuned model response: {response.choices[0].message.content}")
    print("\nEndpoint is running. Delete it when done to avoid charges:")
    print(f' client.endpoints.delete("{endpoint.id}")')
def wait_for_file_ready(file_id: str, poll_interval: int = 5) -> None:
    """Wait for the uploaded file ``file_id`` to finish server-side processing.

    Returns once the file's processing status is ``COMPLETED``; sleeps
    ``poll_interval`` seconds between checks.

    Raises:
        ValueError: When the processing status is ``INVALID_FORMAT``.
        RuntimeError: When the processing status is ``FAILED``.
    """
    # The loop condition performs the retrieval, so every pass sees fresh metadata.
    while (info := client.files.retrieve(file_id)).processing_status != "COMPLETED":
        if info.processing_status == "INVALID_FORMAT":
            raise ValueError(
                f"file {file_id} is not suitable for fine-tuning: {info.validation_report}"
            )
        if info.processing_status == "FAILED":
            raise RuntimeError(
                f"file {file_id} processing did not complete: {info.validation_report}"
            )
        time.sleep(poll_interval)
San Francisco", + }, + "unit": { + "type": "string", + "enum": ["celsius", "fahrenheit"], + }, + }, + "required": ["city"], + }, + }, + }, + { + "type": "function", + "function": { + "name": "search_restaurants", + "description": "Search for restaurants in a city", + "parameters": { + "type": "object", + "properties": { + "city": {"type": "string", "description": "City name"}, + "cuisine": {"type": "string", "description": "Cuisine type"}, + }, + "required": ["city"], + }, + }, + }, + ] + + +def sample_training_data(tools: list[dict]) -> list[dict]: + """Return a small function-calling fine-tuning dataset.""" + return [ + { + "tools": tools, + "messages": [ + {"role": "user", "content": "What's the weather like in San Francisco?"}, + { + "role": "assistant", + "tool_calls": [ + { + "id": "call_1", + "type": "function", + "function": { + "name": "get_weather", + "arguments": '{"city": "San Francisco", "unit": "fahrenheit"}', + }, + } + ], + }, + { + "role": "tool", + "tool_call_id": "call_1", + "content": '{"temp": 65, "condition": "foggy", "unit": "fahrenheit"}', + }, + { + "role": "assistant", + "content": "It's currently 65F and foggy in San Francisco.", + }, + ], + }, + { + "tools": tools, + "messages": [ + {"role": "user", "content": "Find me Italian restaurants in NYC"}, + { + "role": "assistant", + "tool_calls": [ + { + "id": "call_2", + "type": "function", + "function": { + "name": "search_restaurants", + "arguments": '{"city": "New York", "cuisine": "Italian"}', + }, + } + ], + }, + { + "role": "tool", + "tool_call_id": "call_2", + "content": '{"restaurants": ["Carbone", "L\'Artusi", "Via Carota"]}', + }, + { + "role": "assistant", + "content": ( + "Here are some top Italian restaurants in NYC: " + "Carbone, L'Artusi, and Via Carota." 
def create_temp_dataset(rows: list[dict]) -> Path:
    """Serialize ``rows`` as JSON Lines into a new temporary ``.jsonl`` file.

    The file is created with ``delete=False``, so it stays on disk after this
    function returns; removing it is left to the caller.

    Returns:
        Path of the file that was written.
    """
    handle = tempfile.NamedTemporaryFile(
        mode="w",
        suffix=".jsonl",
        delete=False,
        encoding="utf-8",
    )
    with handle:
        # One JSON document per line, each terminated by a newline.
        handle.writelines(f"{json.dumps(record)}\n" for record in rows)
    return Path(handle.name)
def main() -> None:
    """Run the function-calling fine-tuning workflow end to end.

    Uploads the training data (the bundled sample is used when
    ``--training-file`` is omitted), trains a LoRA job, deploys the resulting
    model to a dedicated endpoint and sends one test prompt with the sample
    tool definitions attached.

    Raises:
        SystemExit: With exit code 1 when the job or the endpoint ends in a
            failure state.
    """
    # "failed" is kept for backward compatibility; "error" / "user_error" are
    # the failure statuses listed in the Together fine-tuning API reference.
    # Without them a failed job would be polled forever.
    job_failure_statuses = ("failed", "cancelled", "error", "user_error")
    # "ERROR" is the endpoint failure state listed in the endpoints reference.
    endpoint_failure_states = ("FAILED", "STOPPED", "ERROR")

    args = parse_args()
    tools = build_tools()

    # --- 1. Pick the dataset (user file or bundled sample) ---
    data_path: Path | None = None
    upload_path = args.training_file
    if upload_path is None:
        training_data = sample_training_data(tools)
        data_path = create_temp_dataset(training_data)
        upload_path = str(data_path)
        print(f"Wrote {len(training_data)} function calling examples to {data_path}")

    # --- 2. Upload ---
    try:
        file_resp = client.files.upload(file=upload_path, purpose="fine-tune", check=True)
    finally:
        # The temporary sample file is only needed for the upload itself.
        if data_path is not None:
            data_path.unlink(missing_ok=True)
    print(f"Uploaded file: {file_resp.id}")

    # Wait for server-side validation before starting training.
    print("Waiting for server-side validation...")
    wait_for_file_ready(file_resp.id)
    print("File ready for fine-tuning.")

    # --- 3. Start LoRA fine-tuning ---
    job = client.fine_tuning.create(
        training_file=file_resp.id,
        model=args.model,
        lora=True,
        n_epochs=args.n_epochs,
        learning_rate=args.learning_rate,
        suffix=args.suffix,
    )
    print(f"Created job: {job.id}")

    # --- 4. Monitor ---
    while True:
        status = client.fine_tuning.retrieve(id=job.id)
        print(f" Status: {status.status}")
        if status.status == "completed":
            print(f"\nTraining complete! Output: {status.x_model_output_name}")
            break
        if status.status in job_failure_statuses:
            print(f"Job ended: {status.status}")
            raise SystemExit(1)
        time.sleep(args.poll_interval)

    # --- 5. Deploy and test function calling with fine-tuned model ---
    print("\n--- Deploying fine-tuned model ---")
    output_model = status.x_model_output_name
    endpoint = client.endpoints.create(
        display_name="Function Calling Fine-tuned",
        model=output_model,
        hardware="4x_nvidia_h100_80gb_sxm",
        autoscaling={"min_replicas": 1, "max_replicas": 1},
    )
    print(f"Created endpoint: {endpoint.id}")

    while True:
        ep = client.endpoints.retrieve(endpoint.id)
        print(f" Endpoint state: {ep.state}")
        if ep.state == "STARTED":
            break
        if ep.state in endpoint_failure_states:
            print(f"Endpoint {ep.state}")
            raise SystemExit(1)
        time.sleep(args.poll_interval)

    print("\n--- Testing function calling ---")
    response = client.chat.completions.create(
        model=endpoint.name,
        messages=[{"role": "user", "content": args.test_prompt}],
        tools=tools,
    )

    # Print the requested tool calls when there are any, else the plain text reply.
    message = response.choices[0].message
    tool_calls = message.tool_calls
    if tool_calls:
        for tool_call in tool_calls:
            print(f" Tool call: {tool_call.function.name}({tool_call.function.arguments})")
    else:
        print(f" Response: {message.content}")
    print("\nEndpoint is running. Delete it when done to avoid charges:")
    print(f' client.endpoints.delete("{endpoint.id}")')
def wait_for_file_ready(file_id: str, poll_interval: int = 5) -> None:
    """Block until the uploaded file ``file_id`` reports a terminal processing status.

    Args:
        file_id: Identifier of the uploaded file to check.
        poll_interval: Seconds to sleep between checks.

    Raises:
        ValueError: The processing status is ``INVALID_FORMAT``.
        RuntimeError: The processing status is ``FAILED``.
    """
    # Terminal failure statuses mapped to the exception type and message fragment to use.
    failures = {
        "INVALID_FORMAT": (ValueError, "is not suitable for fine-tuning"),
        "FAILED": (RuntimeError, "processing did not complete"),
    }

    while True:
        meta = client.files.retrieve(file_id)
        current = meta.processing_status
        if current == "COMPLETED":
            return
        if current in failures:
            error_type, reason = failures[current]
            raise error_type(f"file {file_id} {reason}: {meta.validation_report}")
        time.sleep(poll_interval)
def parse_args() -> argparse.Namespace:
    """Parse the command-line options of the reasoning fine-tuning script.

    Returns:
        Namespace with ``training_file``, ``model``, ``suffix``, ``n_epochs``,
        ``learning_rate``, ``poll_interval`` and ``test_prompt``.
    """
    # (flag, add_argument keyword arguments), in the order shown by --help.
    option_specs = (
        ("--training-file", {"help": "Path to a reasoning training JSONL file"}),
        ("--model", {"default": "Qwen/Qwen3-8B", "help": "Reasoning-capable base model"}),
        ("--suffix", {"default": "reasoning-math-v1", "help": "Suffix for the fine-tuned model"}),
        ("--n-epochs", {"type": int, "default": 3, "help": "Number of training epochs"}),
        ("--learning-rate", {"type": float, "default": 1e-5, "help": "Training learning rate"}),
        ("--poll-interval", {"type": int, "default": 30, "help": "Seconds between status checks"}),
        (
            "--test-prompt",
            {
                "default": "What is 25% of 360?",
                "help": "Prompt to use when testing the fine-tuned model",
            },
        ),
    )

    cli = argparse.ArgumentParser(description="Together AI reasoning fine-tuning workflow")
    for flag, settings in option_specs:
        cli.add_argument(flag, **settings)
    return cli.parse_args()
Output: {status.x_model_output_name}") + break + if status.status in ("failed", "cancelled"): + print(f"Job ended: {status.status}") + raise SystemExit(1) + time.sleep(args.poll_interval) + + # --- 5. Deploy and test reasoning inference --- + print("\n--- Deploying fine-tuned model ---") + output_model = status.x_model_output_name + endpoint = client.endpoints.create( + display_name="Reasoning Fine-tuned", + model=output_model, + hardware="4x_nvidia_h100_80gb_sxm", + autoscaling={"min_replicas": 1, "max_replicas": 1}, + ) + print(f"Created endpoint: {endpoint.id}") + + while True: + ep = client.endpoints.retrieve(endpoint.id) + print(f" Endpoint state: {ep.state}") + if ep.state == "STARTED": + break + if ep.state in ("FAILED", "STOPPED"): + print(f"Endpoint {ep.state}") + raise SystemExit(1) + time.sleep(args.poll_interval) + + print("\n--- Testing reasoning inference ---") + stream = client.chat.completions.create( + model=endpoint.name, + messages=[{"role": "user", "content": args.test_prompt}], + stream=True, + ) + + reasoning_text = "" + content_text = "" + for chunk in stream: + if chunk.choices: + delta = chunk.choices[0].delta + if hasattr(delta, "reasoning") and delta.reasoning: + reasoning_text += delta.reasoning + if hasattr(delta, "content") and delta.content: + content_text += delta.content + + print(f"Reasoning: {reasoning_text}") + print(f"Answer: {content_text}") + print(f"\nEndpoint is running. Delete it when done to avoid charges:") + print(f" client.endpoints.delete(\"{endpoint.id}\")") + + # --- 6. (Optional) Preference fine-tuning for reasoning --- + dpo_example = { + "input": {"messages": [{"role": "user", "content": "What is 15% of 240?"}]}, + "preferred_output": [ + { + "role": "assistant", + "reasoning": "15% means 15/100 = 0.15\n0.15 * 240 = 36", + "content": "15% of 240 is **36**.", + } + ], + "non_preferred_output": [ + { + "role": "assistant", + "reasoning": "15% of 240... 
let me guess...", + "content": "About 30.", + } + ], + } + print(f"\nDPO reasoning example format:\n{json.dumps(dpo_example, indent=2)}") + + +if __name__ == "__main__": + main() diff --git a/plugins/togetherai/skills/together-fine-tuning/scripts/vlm_finetune.py b/plugins/togetherai/skills/together-fine-tuning/scripts/vlm_finetune.py new file mode 100644 index 00000000..9224ee20 --- /dev/null +++ b/plugins/togetherai/skills/together-fine-tuning/scripts/vlm_finetune.py @@ -0,0 +1,228 @@ +#!/usr/bin/env python3 +""" +Together AI Fine-Tuning -- VLM (Vision-Language) Fine-Tuning (v2 SDK) + +Prepare image+text training data with base64-encoded images, upload, +and fine-tune a vision-language model. + +Usage: + python vlm_finetune.py + python vlm_finetune.py --training-file vlm.jsonl --model Qwen/Qwen3-VL-8B-Instruct + python vlm_finetune.py --sample-image-url https://example.com/image.jpg + +Requires: + uv pip install "together>=2.0.0" requests + export TOGETHER_API_KEY=your_key +""" + +import argparse +import base64 +import json +import tempfile +import time +from pathlib import Path + +import requests +from together import Together + +client = Together() + + +def wait_for_file_ready(file_id: str, poll_interval: int = 5) -> None: + """Block until server-side fine-tuning validation finishes for ``file_id``.""" + while True: + meta = client.files.retrieve(file_id) + if meta.processing_status == "COMPLETED": + return + if meta.processing_status == "INVALID_FORMAT": + raise ValueError( + f"file {file_id} is not suitable for fine-tuning: {meta.validation_report}" + ) + if meta.processing_status == "FAILED": + raise RuntimeError( + f"file {file_id} processing did not complete: {meta.validation_report}" + ) + time.sleep(poll_interval) + + +def url_to_base64(url: str, mime_type: str = "image/jpeg") -> str: + """Download an image URL and return a base64 data URI.""" + response = requests.get(url, timeout=60) + response.raise_for_status() + encoded = 
base64.b64encode(response.content).decode("utf-8") + return f"data:{mime_type};base64,{encoded}" + + +def sample_training_data(image_data_uri: str) -> list[dict]: + """Return a small VLM fine-tuning dataset.""" + return [ + { + "messages": [ + { + "role": "system", + "content": [{"type": "text", "text": "You are a helpful vision assistant."}], + }, + { + "role": "user", + "content": [ + {"type": "text", "text": "How many items are in this image?"}, + {"type": "image_url", "image_url": {"url": image_data_uri}}, + ], + }, + { + "role": "assistant", + "content": [{"type": "text", "text": "There are 3 items in the image."}], + }, + ] + }, + { + "messages": [ + { + "role": "system", + "content": [{"type": "text", "text": "You are a helpful vision assistant."}], + }, + { + "role": "user", + "content": [ + {"type": "text", "text": "Describe what you see in this image."}, + {"type": "image_url", "image_url": {"url": image_data_uri}}, + ], + }, + { + "role": "assistant", + "content": [ + { + "type": "text", + "text": "The image shows a desk with a laptop, a coffee mug, and a notebook.", + } + ], + }, + ] + }, + ] + + +def create_temp_dataset(rows: list[dict]) -> Path: + """Write JSONL rows to a temporary file.""" + with tempfile.NamedTemporaryFile("w", suffix=".jsonl", delete=False, encoding="utf-8") as temp_file: + for example in rows: + temp_file.write(json.dumps(example) + "\n") + return Path(temp_file.name) + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Together AI VLM fine-tuning workflow") + parser.add_argument("--training-file", help="Path to a VLM training JSONL file") + parser.add_argument("--model", default="Qwen/Qwen3-VL-8B-Instruct", help="Base VLM to fine-tune") + parser.add_argument("--suffix", default="vlm-v1", help="Suffix for the fine-tuned model") + parser.add_argument("--n-epochs", type=int, default=3, help="Number of training epochs") + parser.add_argument("--learning-rate", type=float, default=1e-5, 
help="Training learning rate") + parser.add_argument( + "--train-vision", + action="store_true", + help="Also train the vision encoder instead of text-only LoRA updates", + ) + parser.add_argument("--poll-interval", type=int, default=30, help="Seconds between status checks") + parser.add_argument("--sample-image-url", help="Optional image URL to embed into the bundled sample dataset") + parser.add_argument( + "--test-prompt", + default="What do you see in this image?", + help="Prompt to use when testing the fine-tuned model", + ) + return parser.parse_args() + + +def main() -> None: + args = parse_args() + data_path: Path | None = None + upload_path = args.training_file + sample_image = ( + url_to_base64(args.sample_image_url) + if args.sample_image_url + else "data:image/jpeg;base64,/9j/4AAQSkZJRg==" + ) + if upload_path is None: + vlm_training_data = sample_training_data(sample_image) + data_path = create_temp_dataset(vlm_training_data) + upload_path = str(data_path) + print(f"Wrote {len(vlm_training_data)} VLM examples to {data_path}") + + # --- 2. Upload --- + try: + file_resp = client.files.upload(file=upload_path, purpose="fine-tune", check=True) + finally: + if data_path is not None: + data_path.unlink(missing_ok=True) + print(f"Uploaded file: {file_resp.id}") + + # Wait for server-side validation before starting training. + print("Waiting for server-side validation...") + wait_for_file_ready(file_resp.id) + print("File ready for fine-tuning.") + + # --- 3. Start VLM LoRA fine-tuning --- + job = client.fine_tuning.create( + training_file=file_resp.id, + model=args.model, + lora=True, + train_vision=args.train_vision, + n_epochs=args.n_epochs, + learning_rate=args.learning_rate, + suffix=args.suffix, + ) + print(f"Created VLM fine-tuning job: {job.id}") + + # --- 4. Monitor --- + while True: + status = client.fine_tuning.retrieve(id=job.id) + print(f" Status: {status.status}") + if status.status == "completed": + print(f"\nVLM training complete! 
Output: {status.x_model_output_name}") + break + if status.status in ("failed", "cancelled"): + print(f"Job ended: {status.status}") + raise SystemExit(1) + time.sleep(args.poll_interval) + + # --- 5. Deploy and test VLM inference --- + print("\n--- Deploying fine-tuned VLM ---") + output_model = status.x_model_output_name + endpoint = client.endpoints.create( + display_name="VLM Fine-tuned", + model=output_model, + hardware="4x_nvidia_h100_80gb_sxm", + autoscaling={"min_replicas": 1, "max_replicas": 1}, + ) + print(f"Created endpoint: {endpoint.id}") + + while True: + ep = client.endpoints.retrieve(endpoint.id) + print(f" Endpoint state: {ep.state}") + if ep.state == "STARTED": + break + if ep.state in ("FAILED", "STOPPED"): + print(f"Endpoint {ep.state}") + raise SystemExit(1) + time.sleep(args.poll_interval) + + print("\n--- Testing VLM inference ---") + response = client.chat.completions.create( + model=endpoint.name, + messages=[ + { + "role": "user", + "content": [ + {"type": "text", "text": args.test_prompt}, + {"type": "image_url", "image_url": {"url": sample_image}}, + ], + } + ], + max_tokens=512, + ) + print(f"VLM response: {response.choices[0].message.content}") + print(f"\nEndpoint is running. Delete it when done to avoid charges:") + print(f" client.endpoints.delete(\"{endpoint.id}\")") + + +if __name__ == "__main__": + main() diff --git a/plugins/togetherai/skills/together-gpu-clusters/SKILL.md b/plugins/togetherai/skills/together-gpu-clusters/SKILL.md new file mode 100644 index 00000000..ddeeba27 --- /dev/null +++ b/plugins/togetherai/skills/together-gpu-clusters/SKILL.md @@ -0,0 +1,88 @@ +--- +name: together-gpu-clusters +description: "On-demand and reserved GPU clusters (H100, H200, B200) on Together AI with Kubernetes or Slurm orchestration, shared storage, credential management, and cluster scaling for ML and HPC jobs. Reach for it when the user needs multi-node compute or infrastructure control rather than a managed model endpoint." 
+--- + +# Together GPU Clusters + +## Overview + +Use Together AI GPU clusters when the user needs infrastructure control instead of a managed +inference product. + +Typical fits: + +- distributed training +- multi-node inference +- HPC or Slurm workloads +- custom Kubernetes jobs +- attached shared storage and cluster lifecycle management + +## When This Skill Wins + +- Provision a cluster and manage it over time +- Choose between on-demand and reserved capacity +- Choose Kubernetes or Slurm as the orchestration layer +- Manage shared volumes and credentials +- Scale up, scale down, or troubleshoot node health + +## Hand Off To Another Skill + +- Use `together-dedicated-endpoints` for managed single-model hosting +- Use `together-dedicated-containers` for containerized inference without owning the full cluster +- Use `together-sandboxes` for short-lived remote Python execution +- Use `together-fine-tuning` for managed training jobs instead of raw cluster operations + +## Quick Routing + +- Cluster creation, scaling, credentials, deletion + - Start with [scripts/manage_cluster.py](scripts/manage_cluster.py) or [scripts/manage_cluster.ts](scripts/manage_cluster.ts) + - Read [references/api-reference.md](references/api-reference.md) +- Shared storage lifecycle + - Use [scripts/manage_storage.py](scripts/manage_storage.py) + - Read [references/api-reference.md](references/api-reference.md) +- Kubernetes vs Slurm operations + - Read [references/cluster-management.md](references/cluster-management.md) +- Troubleshooting node health, PVCs, or scheduling + - Read [references/cluster-management.md](references/cluster-management.md) +- tcloud CLI workflows + - Read [references/tcloud-cli.md](references/tcloud-cli.md) + +## Workflow + +1. Decide whether the workload really needs cluster-level control. +2. Choose on-demand vs reserved billing based on run duration and baseline utilization. +3. Choose Kubernetes vs Slurm based on orchestration requirements and team tooling. 
+4. Select region, GPU type, driver version, and shared storage plan. +5. Provision first, then layer in access credentials, workload deployment, scaling, and health checks. + +## High-Signal Rules + +- Python scripts require the Together v2 SDK (`together>=2.0.0`). If the user is on an older version, they must upgrade first: `uv pip install --upgrade "together>=2.0.0"`. +- Prefer managed products unless the user explicitly needs raw infrastructure control. +- Treat storage lifecycle separately from cluster lifecycle; volumes can outlive clusters. +- When creating a cluster with new shared storage, prefer inline `shared_volume` over creating a volume separately and attaching via `volume_id`. Separately created volumes may land in a different datacenter partition than the cluster, causing a "does not exist in the datacenter" error even when the volume shows as available. +- GPU stock-outs (409 "Out of stock") are common. Always call `list_regions()` first and be prepared to try multiple regions. +- The API requires `cuda_version` and `nvidia_driver_version` as separate fields in addition to the combined `driver_version` string. Pass them via `extra_body` in the Python SDK. +- Credentials retrieval is part of provisioning. Do not stop at cluster creation if the user needs to run workloads immediately. +- Slurm and Kubernetes operational patterns differ materially; read the cluster-management reference before improvising. +- For repeated cluster operations, start from the scripts instead of rebuilding request shapes. +- Slurm startup scripts (worker/login init, worker/controller prolog and epilog, extra `slurm.conf`) are Slinky v1.0 only. A non-zero exit from a worker prolog or epilog drains the node, and calling Slurm commands (`squeue`, `scontrol`, `sacctmgr`) inside any prolog/epilog can deadlock the scheduler. 
+ +## Resource Map + +- Cluster API reference: [references/api-reference.md](references/api-reference.md) +- Operational guide: [references/cluster-management.md](references/cluster-management.md) +- Operational troubleshooting: [references/cluster-management.md](references/cluster-management.md) +- CLI guide: [references/tcloud-cli.md](references/tcloud-cli.md) +- Python cluster management: [scripts/manage_cluster.py](scripts/manage_cluster.py) +- TypeScript cluster management: [scripts/manage_cluster.ts](scripts/manage_cluster.ts) +- Python storage management: [scripts/manage_storage.py](scripts/manage_storage.py) + +## Official Docs + +- [GPU Clusters Overview](https://docs.together.ai/docs/gpu-clusters-overview) +- [GPU Clusters Quickstart](https://docs.together.ai/docs/gpu-clusters-quickstart) +- [Clusters API](https://docs.together.ai/reference/clusters-create) +- [Slurm Startup Scripts](https://docs.together.ai/docs/slurm-startup-scripts) +- [Instant GPU Clusters](https://www.together.ai/instant-gpu-clusters) diff --git a/plugins/togetherai/skills/together-gpu-clusters/references/api-reference.md b/plugins/togetherai/skills/together-gpu-clusters/references/api-reference.md new file mode 100644 index 00000000..23774c21 --- /dev/null +++ b/plugins/togetherai/skills/together-gpu-clusters/references/api-reference.md @@ -0,0 +1,478 @@ +# GPU Clusters API Reference +## Contents + +- [Cluster Endpoints](#cluster-endpoints) +- [Storage Endpoints](#storage-endpoints) +- [Create Cluster](#create-cluster) +- [List Clusters](#list-clusters) +- [Get Cluster](#get-cluster) +- [Update / Scale Cluster](#update-scale-cluster) +- [Delete Cluster](#delete-cluster) +- [List Regions](#list-regions) +- [Create Shared Volume](#create-shared-volume) +- [List Shared Volumes](#list-shared-volumes) +- [Get Shared Volume](#get-shared-volume) +- [Update (Resize) Shared Volume](#update-shared-volume) +- [Delete Shared Volume](#delete-shared-volume) +- [Instance Types](#instance-types) +- 
[Driver Versions](#driver-versions) +- [Cluster Statuses](#cluster-statuses) +- [Volume Statuses](#volume-statuses) +- [Cluster Response Object](#cluster-response-object) + + +Base URL: `https://api.together.xyz/v1` + +## Cluster Endpoints + +| Endpoint | Operation | Description | +|----------|-----------|-------------| +| `POST /compute/clusters` | Create cluster | Provision a new GPU cluster | +| `GET /compute/clusters` | List clusters | List all GPU clusters | +| `GET /compute/clusters/{id}` | Get cluster | Get cluster details | +| `PUT /compute/clusters/{id}` | Update cluster | Scale or change cluster type | +| `DELETE /compute/clusters/{id}` | Delete cluster | Remove a cluster | +| `GET /compute/regions` | List regions | Available regions, GPUs, drivers | + +## Storage Endpoints + +| Endpoint | Operation | Description | +|----------|-----------|-------------| +| `POST /compute/clusters/storage/volumes` | Create volume | Create shared storage | +| `GET /compute/clusters/storage/volumes` | List volumes | List all volumes | +| `GET /compute/clusters/storage/volumes/{id}` | Get volume | Get volume details | +| `PUT /compute/clusters/storage/volumes` | Update volume | Resize a volume | +| `DELETE /compute/clusters/storage/volumes/{id}` | Delete volume | Remove a volume | + +## Create Cluster + +The API requires `cuda_version` and `nvidia_driver_version` as separate fields. The SDK +also accepts a combined `driver_version` string, but the two split fields must be present +for the request to succeed. Pass them via `extra_body` in the SDK or directly in REST. 
+ +```python +from together import Together +client = Together() + +cluster = client.beta.clusters.create( + cluster_name="my-gpu-cluster", + region="us-central-8", + gpu_type="H100_SXM", + num_gpus=8, + driver_version="CUDA_12_6_560", + billing_type="ON_DEMAND", + cluster_type="KUBERNETES", + # volume_id="existing-volume-id", # optional: attach existing volume + extra_body={ + "cuda_version": "12.6", + "nvidia_driver_version": "560", + }, +) +print(cluster.cluster_id) +``` + +```typescript +import Together from "together-ai"; +const client = new Together(); + +const cluster = await client.beta.clusters.create({ + cluster_name: "my-gpu-cluster", + region: "us-central-8", + gpu_type: "H100_SXM", + num_gpus: 8, + driver_version: "CUDA_12_6_560", + billing_type: "ON_DEMAND", + cluster_type: "KUBERNETES", + // @ts-expect-error -- required by API but not yet in SDK types + cuda_version: "12.6", + nvidia_driver_version: "560", +}); +console.log(cluster.cluster_id); +``` + +```shell +curl -X POST \ + -H "Authorization: Bearer $TOGETHER_API_KEY" \ + -H "Content-Type: application/json" \ + -d '{ + "cluster_name": "my-gpu-cluster", + "region": "us-central-8", + "gpu_type": "H100_SXM", + "num_gpus": 8, + "driver_version": "CUDA_12_6_560", + "cuda_version": "12.6", + "nvidia_driver_version": "560", + "billing_type": "ON_DEMAND", + "cluster_type": "KUBERNETES" + }' \ + https://api.together.xyz/v1/compute/clusters +``` + +```shell +together beta clusters create \ + --name my-gpu-cluster \ + --num-gpus 8 \ + --gpu-type H100_SXM \ + --region us-central-8 \ + --driver-version CUDA_12_6_560 \ + --billing-type ON_DEMAND \ + --cluster-type KUBERNETES +``` + +### Create Request Body + +| Field | Type | Required | Description | +|-------|------|----------|-------------| +| `cluster_name` | string | Yes | Name of the cluster | +| `region` | string | Yes | Region (use `list_regions()` in Python or `listRegions()` in TypeScript to find valid regions) | +| `gpu_type` | string | Yes | GPU 
type (see Instance Types below) | +| `num_gpus` | integer | Yes | Number of GPUs (must be a multiple of 8) | +| `driver_version` | string | Yes | Combined driver string, e.g. `CUDA_12_6_560` (see Driver Versions below) | +| `cuda_version` | string | Yes | CUDA version, e.g. `"12.6"` | +| `nvidia_driver_version` | string | Yes | NVIDIA driver version, e.g. `"560"` | +| `billing_type` | string | Yes | `ON_DEMAND` or `RESERVED` | +| `cluster_type` | string | No | `KUBERNETES` (default) or `SLURM` | +| `duration_days` | integer | No | Reservation length in days (only with `RESERVED`) | +| `volume_id` | string | No | Existing shared volume ID to attach | +| `shared_volume` | object | No | Inline volume: `{volume_name, size_tib, region}` | + +## List Clusters + +```python +response = client.beta.clusters.list() +for c in response.clusters: + print(f"{c.cluster_id}: {c.cluster_name} ({c.status}, {c.num_gpus} GPUs)") +``` + +```typescript +const response = await client.beta.clusters.list(); +for (const c of response.clusters) { + console.log(`${c.cluster_id}: ${c.cluster_name} (${c.status})`); +} +``` + +```shell +curl -X GET \ + -H "Authorization: Bearer $TOGETHER_API_KEY" \ + https://api.together.xyz/v1/compute/clusters +``` + +```shell +together beta clusters list +``` + +## Get Cluster + +```python +cluster = client.beta.clusters.retrieve("cluster-id") +print(f"Status: {cluster.status}, GPUs: {cluster.num_gpus}") +``` + +```typescript +const cluster = await client.beta.clusters.retrieve("cluster-id"); +console.log(cluster); +``` + +```shell +curl -X GET \ + -H "Authorization: Bearer $TOGETHER_API_KEY" \ + https://api.together.xyz/v1/compute/clusters/${CLUSTER_ID} +``` + +```shell +together beta clusters retrieve ${CLUSTER_ID} +``` + +## Update / Scale Cluster + +```python +cluster = client.beta.clusters.update( + "cluster-id", + num_gpus=16, + cluster_type="KUBERNETES", +) +``` + +```typescript +const cluster = await client.beta.clusters.update("cluster-id", { + num_gpus: 16, + 
cluster_type: "KUBERNETES", +}); +``` + +```shell +curl -X PUT \ + -H "Authorization: Bearer $TOGETHER_API_KEY" \ + -H "Content-Type: application/json" \ + -d '{"num_gpus": 16, "cluster_type": "KUBERNETES"}' \ + https://api.together.xyz/v1/compute/clusters/${CLUSTER_ID} +``` + +```shell +together beta clusters update ${CLUSTER_ID} --num-gpus 16 +together beta clusters update ${CLUSTER_ID} --num-gpus 16 --cluster-type KUBERNETES +``` + +### Update Request Body + +| Field | Type | Description | +|-------|------|-------------| +| `num_gpus` | integer | New GPU count (must be a multiple of 8) | +| `cluster_type` | string | `KUBERNETES` or `SLURM` | + +## Delete Cluster + +```python +client.beta.clusters.delete("cluster-id") +``` + +```typescript +await client.beta.clusters.delete("cluster-id"); +``` + +```shell +curl -X DELETE \ + -H "Authorization: Bearer $TOGETHER_API_KEY" \ + https://api.together.xyz/v1/compute/clusters/${CLUSTER_ID} +``` + +```shell +together beta clusters delete ${CLUSTER_ID} +``` + +## List Regions + +```python +regions = client.beta.clusters.list_regions() +for r in regions.regions: + print(f"{r.name}: {r.supported_instance_types}, drivers: {r.driver_versions}") +``` + +```typescript +const regions = await client.beta.clusters.listRegions(); +console.log(regions); +``` + +```shell +curl -X GET \ + -H "Authorization: Bearer $TOGETHER_API_KEY" \ + https://api.together.xyz/v1/compute/regions +``` + +```shell +together beta clusters list-regions +``` + +## Create Shared Volume + +```python +volume = client.beta.clusters.storage.create( + volume_name="my-shared-volume", + size_tib=2, + region="us-central-8", +) +print(volume.volume_id) +``` + +```typescript +const volume = await client.beta.clusters.storage.create({ + volume_name: "my-shared-volume", + size_tib: 2, + region: "us-central-8", +}); +console.log(volume.volume_id); +``` + +```shell +curl -X POST \ + -H "Authorization: Bearer $TOGETHER_API_KEY" \ + -H "Content-Type: application/json" \ + -d '{"volume_name": "my-shared-volume", 
"size_tib": 2, "region": "us-central-8"}' \ + https://api.together.xyz/v1/compute/clusters/storage/volumes +``` + +```shell +together beta clusters storage create \ + --volume-name my-shared-volume \ + --size-tib 2 \ + --region us-central-8 +``` + +### Volume Create Request Body + +| Field | Type | Required | Description | +|-------|------|----------|-------------| +| `volume_name` | string | Yes | Name of the volume | +| `size_tib` | integer | Yes | Size in tebibytes (TiB) | +| `region` | string | Yes | Region name | + +## List Shared Volumes + +```python +volumes = client.beta.clusters.storage.list() +for v in volumes.volumes: + print(f"{v.volume_id}: {v.volume_name} ({v.size_tib} TiB, {v.status})") +``` + +```typescript +const volumes = await client.beta.clusters.storage.list(); +console.log(volumes); +``` + +```shell +curl -X GET \ + -H "Authorization: Bearer $TOGETHER_API_KEY" \ + https://api.together.xyz/v1/compute/clusters/storage/volumes +``` + +```shell +together beta clusters storage list +``` + +## Get Shared Volume + +```python +volume = client.beta.clusters.storage.retrieve("volume-id") +print(f"{volume.volume_name}: {volume.size_tib} TiB ({volume.status})") +``` + +```typescript +const volume = await client.beta.clusters.storage.retrieve("volume-id"); +console.log(volume); +``` + +```shell +curl -X GET \ + -H "Authorization: Bearer $TOGETHER_API_KEY" \ + https://api.together.xyz/v1/compute/clusters/storage/volumes/${VOLUME_ID} +``` + +```shell +together beta clusters storage retrieve ${VOLUME_ID} +``` + +## Update (Resize) Shared Volume + +```python +volume = client.beta.clusters.storage.update( + volume_id="volume-id", + size_tib=5, +) +``` + +```typescript +const volume = await client.beta.clusters.storage.update({ + volume_id: "volume-id", + size_tib: 5, +}); +``` + +```shell +curl -X PUT \ + -H "Authorization: Bearer $TOGETHER_API_KEY" \ + -H "Content-Type: application/json" \ + -d '{"volume_id": "volume-id", "size_tib": 5}' \ + 
https://api.together.xyz/v1/compute/clusters/storage/volumes +``` + +## Delete Shared Volume + +Volume must not be attached to any cluster. + +```python +client.beta.clusters.storage.delete("volume-id") +``` + +```typescript +await client.beta.clusters.storage.delete("volume-id"); +``` + +```shell +curl -X DELETE \ + -H "Authorization: Bearer $TOGETHER_API_KEY" \ + https://api.together.xyz/v1/compute/clusters/storage/volumes/${VOLUME_ID} +``` + +```shell +together beta clusters storage delete ${VOLUME_ID} +``` + +## Instance Types + +| CLI Value | GPU | Memory | Notes | +|-----------|-----|--------|-------| +| `H100_SXM` | NVIDIA H100 | 80GB | InfiniBand networking | +| `H100_SXM_INF` | NVIDIA H100 | 80GB | Inference-optimized, lower IB bandwidth | +| `H200_SXM` | NVIDIA H200 | 141GB | InfiniBand networking | +| `B200_SXM` | NVIDIA B200 | 192GB | InfiniBand networking | +| `L40_PCIE` | NVIDIA L40 | 48GB | PCIe | +| `RTX_6000_PCI` | NVIDIA RTX 6000 | 24GB | PCIe | + +## Driver Versions + +Available CUDA driver versions (check `list_regions()` in Python or `listRegions()` in TypeScript for per-region availability). 
+ +The `list_regions()` response returns driver versions as a list of objects: + +```json +[ + {"cuda_version": "12.6", "nvidia_driver_version": "560"}, + {"cuda_version": "12.4", "nvidia_driver_version": "550"} +] +``` + +The combined `driver_version` string follows the pattern `CUDA_{major}_{minor}_{nvidia}`: + +| `driver_version` | `cuda_version` | `nvidia_driver_version` | +|-------------------|----------------|--------------------------| +| `CUDA_12_4_550` | `12.4` | `550` | +| `CUDA_12_5_555` | `12.5` | `555` | +| `CUDA_12_6_560` | `12.6` | `560` | +| `CUDA_12_6_565` | `12.6` | `565` | +| `CUDA_12_8_570` | `12.8` | `570` | +| `CUDA_12_9_575` | `12.9` | `575` | + +## Cluster Statuses + +| Status | Description | +|--------|-------------| +| `Scheduled` | Cluster creation accepted, awaiting resource allocation | +| `WaitingForControlPlaneNodes` | Control plane provisioning | +| `WaitingForDataPlaneNodes` | Worker nodes provisioning | +| `WaitingForSubnet` | Network setup | +| `WaitingForSharedVolume` | Storage provisioning | +| `InstallingDrivers` | CUDA driver installation | +| `RunningAcceptanceTests` | GPU/network health validation | +| `Ready` | Cluster operational | +| `Degraded` | Some nodes unhealthy | +| `Paused` | Cluster paused | +| `OnDemandComputePaused` | On-demand compute paused (credit issue) | +| `Deleting` | Cluster being removed | + +## Volume Statuses + +| Status | Description | +|--------|-------------| +| `available` | Ready for attachment | +| `bound` | Attached to a cluster | +| `provisioning` | Being created | + +## Cluster Response Object + +```json +{ + "cluster_id": "abc-123-def-456", + "cluster_name": "my-gpu-cluster", + "cluster_type": "KUBERNETES", + "region": "us-central-8", + "gpu_type": "H100_SXM", + "num_gpus": 8, + "driver_version": "CUDA_12_6_560", + "duration_hours": 720, + "status": "Ready", + "control_plane_nodes": [...], + "gpu_worker_nodes": [...], + "volumes": [...], + "kube_config": "..." 
+} +``` diff --git a/plugins/togetherai/skills/together-gpu-clusters/references/cluster-management.md b/plugins/togetherai/skills/together-gpu-clusters/references/cluster-management.md new file mode 100644 index 00000000..630b41ec --- /dev/null +++ b/plugins/togetherai/skills/together-gpu-clusters/references/cluster-management.md @@ -0,0 +1,464 @@ +# GPU Cluster Management Reference +## Contents + +- [Cluster Architecture](#cluster-architecture) +- [Access Methods](#access-methods) +- [Slurm Configuration](#slurm-configuration) + - [Startup Scripts (Slinky v1.0 only)](#startup-scripts-slinky-v10-only) +- [GPU Access in Containers](#gpu-access-in-containers) +- [Scaling](#scaling) +- [Storage](#storage) +- [Health Checks](#health-checks) +- [User Management](#user-management) +- [Billing](#billing) +- [Troubleshooting](#troubleshooting) +- [Terraform](#terraform) + + +## Cluster Architecture + +### Kubernetes Mode +- Control Plane -- Manages cluster state, scheduling, API access +- Worker Nodes -- GPU-equipped nodes running workloads +- Networking -- High-speed InfiniBand for multi-node communication +- Storage Layer -- Persistent volumes, local NVMe, shared storage + +### Slurm on Kubernetes (Slinky) +- Slurm Controller -- Runs as K8s pods, manages job queues +- Login Nodes -- SSH-accessible entry points +- Compute Nodes -- GPU workers registered with both K8s and Slurm + +## Access Methods + +### Kubernetes Access + +```shell +# Get credentials +together beta clusters get-credentials ${CLUSTER_ID} +export KUBECONFIG=$HOME/.kube/config + +# Verify +kubectl get nodes +kubectl top nodes +kubectl get pods --all-namespaces +``` + +### Kubernetes Dashboard + +Access the dashboard URL from the cluster UI. 
Retrieve the admin token: + +```shell +kubectl -n kubernetes-dashboard get secret \ + $(kubectl -n kubernetes-dashboard get secret | grep admin-user-token | awk '{print $1}') \ + -o jsonpath='{.data.token}' | base64 -d | pbcopy +``` + +### SSH Access + +SSH keys must be added at `api.together.ai/settings/ssh-key` before cluster creation. + +```shell +# Direct SSH to worker nodes +ssh ${NODE_NAME}.cloud.together.ai + +# Slurm login node +ssh ${USERNAME}@slurm-login +``` + +### Slurm Commands + +```shell +sinfo # Node and partition status +squeue # Job queue +srun --gres=gpu:8 --pty bash # Interactive GPU session +sbatch script.sh # Submit batch job +scancel ${JOB_ID} # Cancel job +scontrol show node ${NODE_NAME} # Detailed node info +scontrol show job ${JOB_ID} # Job details +``` + +## Slurm Configuration + +Slurm clusters use four config files managed via a Kubernetes ConfigMap: + +- slurm.conf: Main cluster configuration (nodes, partitions, scheduling) +- gres.conf: GPU and generic resource definitions +- cgroup.conf: Control group resource management +- plugstack.conf: SPANK plugin configuration + +### Partition Configuration + +``` +PartitionName=gpu Nodes=gpu-nodes State=UP Default=NO MaxTime=24:00:00 +PartitionName=cpu Nodes=cpu-nodes State=UP Default=YES +``` + +### GPU Resource Configuration + +``` +Name=gpu Type=h100 File=/dev/nvidia[0-7] +``` + +### Scheduler Tuning + +``` +SchedulerParameters=batch_sched_delay=10,bf_interval=180,sched_max_job_start=500 +``` + +### Cgroup Settings + +``` +CgroupPlugin=cgroup/v1 +ConstrainCores=yes +ConstrainRAMSpace=yes +``` + +Changes require restarting the Slurm controller via `kubectl rollout restart` and verifying +with `scontrol` and `sinfo`. + +### Startup Scripts (Slinky v1.0 only) + +Lifecycle scripts that run automatically at node startup, job start, and job completion. +Configure under cluster Specs and configuration -> Slurm configuration -> Edit. 
Every +script must start with a shebang (`#!/bin/bash`); saving triggers a live Slurm reconfigure, +so test on a non-critical cluster first. + +| Script | Runs on | When | +|--------|---------|------| +| Worker init | Each worker | Node boot, before jobs | +| Login init | Login node | Login-node startup | +| Worker prolog | Each worker | Before job (first job step by default; see `PrologFlags=Alloc`) | +| Worker epilog | Each worker | After job ends | +| Controller prolog | `slurmctld` | At job allocation | +| Controller epilog | `slurmctld` | At job completion | +| Extra slurm.conf | All nodes | Appended verbatim to `slurm.conf` | + +Failure modes: + +- Worker prolog or epilog non-zero exit -> node set to `DRAIN`. A worker prolog failure additionally requeues batch jobs and cancels interactive jobs (`salloc`, `srun`). +- Controller prolog non-zero exit -> batch job requeued, interactive job cancelled; node not affected. +- Controller epilog non-zero exit -> logged, no other effect. + +Resume a drained node: + +```shell +sudo scontrol update NodeName= State=resume Reason="script fixed" +``` + +Rules: + +- Do not call Slurm commands (`squeue`, `scontrol`, `sacctmgr`) inside a prolog or epilog; this can deadlock the scheduler. +- By default the worker prolog runs at first job step, not at allocation. Add `PrologFlags=Alloc` to Extra slurm.conf to run at allocation. +- After edits, existing workers may keep cached scripts via Slurm's configless mechanism. New jobs on those workers continue using the old scripts until the worker restarts. +- Use `set -e` in init scripts so failures surface immediately; use `SLURM_JOB_ID` and `SLURM_JOB_USER` in prolog/epilog to scope cleanup to the running job. + +## GPU Access in Containers + +GPU devices are exposed by the runtime to all containers, but CUDA support depends on the +container image. Use CUDA-enabled images like `pytorch/pytorch` or `nvidia/cuda`. 
+ +```yaml +apiVersion: v1 +kind: Pod +metadata: + name: gpu-workload-pod +spec: + restartPolicy: Never + containers: + - name: pytorch + image: pytorch/pytorch:2.1.0-cuda12.1-cudnn8-runtime + command: ["/bin/bash", "-c", "sleep infinity"] + resources: + limits: + nvidia.com/gpu: 1 + volumeMounts: + - name: shared-storage + mountPath: /mnt/shared + volumes: + - name: shared-storage + persistentVolumeClaim: + claimName: shared-pvc +``` + +```shell +kubectl apply -f gpu-pod.yaml +kubectl wait --for=condition=Ready pod/gpu-workload-pod +kubectl exec -it gpu-workload-pod -- bash +nvidia-smi +``` + +## Scaling + +### Real-time Scaling + +Scale via UI, CLI, or API at any time. GPU count must be a multiple of 8. + +```python +from together import Together +client = Together() + +cluster = client.beta.clusters.update("cluster-id", num_gpus=16) +``` + +```shell +together beta clusters update <cluster-id> --num-gpus 16 +``` + +### Autoscaling (Kubernetes) + +Enable autoscaling during cluster creation in the UI. The Kubernetes Cluster Autoscaler: +- Scales up when pods are pending due to insufficient resources +- Scales down when nodes are underutilized +- Respects pod disruption budgets + +### Targeted Scale-down + +```shell +# Kubernetes -- cordon specific nodes +kubectl cordon <node-name> + +# Slurm -- drain specific nodes +sudo scontrol update NodeName=<node-name> State=drain Reason="scaling down" +``` + +### Combining Capacity + +Use reserved for baseline + on-demand for bursts. + +## Storage + +### Types + +1. Local NVMe -- High-speed local I/O per node +2. Shared /home -- NFS-mounted across nodes (code, configs, logs) +3.
Shared Volumes -- Multi-NIC, high-throughput persistent storage + +### Kubernetes PVCs + +Shared storage (ReadWriteMany): +```yaml +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: shared-pvc +spec: + accessModes: [ReadWriteMany] + resources: + requests: + storage: 10Gi + volumeName: +``` + +Local storage (ReadWriteOnce): +```yaml +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: local-pvc +spec: + accessModes: [ReadWriteOnce] + resources: + requests: + storage: 50Gi + storageClassName: local-storage-class +``` + +```shell +kubectl apply -f shared-pvc.yaml -n default +kubectl apply -f local-pvc.yaml -n default +kubectl get pvc -A +``` + +### Pod with Volumes + +```yaml +apiVersion: v1 +kind: Pod +metadata: + name: test-pod +spec: + restartPolicy: Never + containers: + - name: ubuntu + image: debian:stable-slim + command: ["/bin/sh", "-c", "sleep infinity"] + volumeMounts: + - name: shared-storage + mountPath: /mnt/shared + - name: local-storage + mountPath: /mnt/local + volumes: + - name: shared-storage + persistentVolumeClaim: + claimName: shared-pvc + - name: local-storage + persistentVolumeClaim: + claimName: local-pvc +``` + +### Data Upload + +```shell +# Small files +kubectl cp LOCAL_FILE POD_NAME:/data/ + +# Large datasets via S3 +# Deploy a data-loader pod with aws-cli +``` + +```yaml +apiVersion: v1 +kind: Pod +metadata: + name: data-loader +spec: + containers: + - name: downloader + image: amazon/aws-cli + command: ["aws", "s3", "cp", "s3://bucket/data", "/mnt/shared/", "--recursive"] + volumeMounts: + - name: shared-storage + mountPath: /mnt/shared + volumes: + - name: shared-storage + persistentVolumeClaim: + claimName: shared-pvc +``` + +## Health Checks + +### Automatic Acceptance Testing + +During provisioning, nodes undergo automatic tests: +- DCGM Diag (Level 2) -- GPU compute, memory, thermal validation +- GPU Burn (5 min) -- stress test for thermal/power issues +- Single-Node NCCL -- GPU-to-GPU communication within a 
node +- Multi-Node NCCL -- cross-node GPU communication + +Nodes showing "Tests Failed" are not added to the cluster until repaired. + +### Available Health Check Tests + +GPU Diagnostics: +- DCGM Diag (levels 1-3): NVIDIA Data Center GPU Manager diagnostics +- GPU Burn: intensive compute stress test + +Network Performance: +- Single-Node NCCL: intra-node GPU communication +- InfiniBand Write Bandwidth: high-speed interconnect performance + +PCIe Performance: +- NVBandwidth: CPU-to-GPU, GPU-to-CPU bandwidth, GPU-CPU latency + +### Node Repair + +- Quick Reprovision: VM recreated on a random physical node (for software issues) +- Migrate to New Host: New VM on different physical hardware (for hardware failures) + +Repair lifecycle: Cordon -> Drain -> Reprovision/Migrate -> Rejoin + +### Monitoring Commands + +```shell +# Check GPU status +nvidia-smi + +# Check for Xid errors +sudo dmesg | grep -i xid + +# Check GPU memory errors +nvidia-smi -q | grep -i ecc + +# Check temperature and throttling +nvidia-smi -q | grep -E 'Temperature|Throttle' + +# Check PCIe link status +nvidia-smi -q | grep -E 'Link Width|Link Speed' + +# Check running GPU processes +nvidia-smi pmon + +# Kubernetes monitoring +kubectl get nodes +kubectl top nodes +kubectl get pvc + +# Slurm monitoring +sinfo +squeue +scontrol show job +``` + +## User Management + +### Roles + +| Role | Control Plane | Data Plane | +|------|--------------|------------| +| Admin | Full write (create/delete/scale clusters and volumes) | Full SSH and kubectl | +| Member | Read-only (view only) | SSH and kubectl access | + +Only admins can add or remove users. Member permissions for in-cluster operations may vary +based on RBAC configuration. + +### Managing Users + +1. Navigate to Settings -> GPU Cluster Projects -> View Project +2. Add User (email) or Remove User +3. New users default to Member role; admins can promote afterward + +Access is always project-wide -- all clusters in a project share the same access list. 
+ +Users require active Together AI accounts before they can be added. + +## Billing + +### Compute + +- Reserved: Upfront payment, 1-90 days, discounted. Non-refundable, non-cancellable. +- On-demand: Hourly billing, no commitment. Can terminate anytime. +- Hybrid: Reserved for baseline + on-demand for burst. + +### Storage + +- Pay-per-TiB, billed independently of cluster lifecycle +- Persists across cluster creation/deletion +- Can expand freely; contact support to reduce + +### Credit Exhaustion + +- Reserved compute: Runs until end date; overflow capacity decommissioned +- On-demand compute: Paused first, then decommissioned if credits not restored +- Storage: Access revoked, then data decommissioned + +## Troubleshooting + +| Issue | Solution | +|-------|----------| +| Cluster stuck provisioning | Check the status for phases like `WaitingForControlPlaneNodes` or `RunningAcceptanceTests` | +| 400 "cuda version and nvidia driver version are required" | Pass `cuda_version` and `nvidia_driver_version` as separate fields alongside `driver_version` | +| 409 "Out of stock" | GPUs unavailable in the requested region. Call `list_regions()` and try another region | +| "Shared volume does not exist in the datacenter" | Volume was created in a different datacenter partition. 
Use inline `shared_volume` at cluster creation instead of a separate `volume_id` | +| Pods not scheduling | Verify node readiness with `kubectl get nodes` and inspect resource requests and taints | +| GPU not accessible in container | Use a CUDA-enabled image such as `pytorch/pytorch` or `nvidia/cuda` | +| Storage PVC not binding | Confirm the volume name matches the shared volume and inspect `kubectl get pvc` | +| Slurm job failures | Run `sinfo` to inspect partitions and `scontrol show job <job-id>` for details | +| Node health issues | Check `nvidia-smi`, inspect Xid errors in `dmesg`, and trigger repair from the UI if needed | + +## Terraform + +```hcl +resource "together_gpu_cluster" "training" { + name = "training-cluster" + num_gpus = 8 + instance_type = "H100-SXM" + region = "us-central-8" + billing_type = "prepaid" + reservation_days = 30 + + shared_volume { + name = "training-data" + size_tib = 5 + } +} +``` diff --git a/plugins/togetherai/skills/together-gpu-clusters/references/tcloud-cli.md b/plugins/togetherai/skills/together-gpu-clusters/references/tcloud-cli.md new file mode 100644 index 00000000..ffbc3cd3 --- /dev/null +++ b/plugins/togetherai/skills/together-gpu-clusters/references/tcloud-cli.md @@ -0,0 +1,313 @@ +# CLI Reference for GPU Clusters +## Contents + +- [Installation](#installation) +- [Cluster Commands](#cluster-commands) +- [Storage Commands](#storage-commands) +- [Instance Types](#instance-types) +- [Driver Versions](#driver-versions) + + +The Together GPU Clusters CLI is available in two forms: + +- Together CLI: `together beta clusters <command>` (included with the Together Python SDK) +- Standalone tcloud: `tcloud cluster <command>` (standalone binary) + +Both CLIs provide equivalent functionality. This reference uses the `together beta clusters` form.
+ +## Installation + +Together CLI (via pip): +```shell +uv pip install "together>=2.0.0" +together auth login +``` + +tcloud standalone binary: + +Mac (Universal): +```shell +curl -LO https://tcloud-cli-downloads.s3.us-west-2.amazonaws.com/releases/latest/tcloud-darwin-universal.tar.gz +tar xzf tcloud-darwin-universal.tar.gz +``` + +Linux (AMD64): +```shell +curl -LO https://tcloud-cli-downloads.s3.us-west-2.amazonaws.com/releases/latest/tcloud-linux-amd64.tar.gz +tar xzf tcloud-linux-amd64.tar.gz +``` + +Authenticate tcloud: +```shell +tcloud sso login +``` + +## Cluster Commands + +### `clusters create` + +Create a new GPU cluster. + +```shell +together beta clusters create [OPTIONS] +``` + +Options: + +| Flag | Type | Description | +|------|------|-------------| +| `--name` | string | Name of the cluster | +| `--num-gpus` | number | Number of GPUs (must be a multiple of 8) | +| `--gpu-type` | enum | `H100_SXM`, `H200_SXM`, `B200_SXM`, `H100_SXM_INF`, `L40_PCIE`, `RTX_6000_PCI` | +| `--region` | string | Region (use `clusters list-regions` to find valid regions) | +| `--billing-type` | enum | `ON_DEMAND` or `RESERVED` | +| `--duration-days` | number | Reservation length in days (only with `RESERVED` billing) | +| `--driver-version` | enum | CUDA driver version (use `clusters list-regions` for options) | +| `--cluster-type` | enum | `KUBERNETES` or `SLURM` | +| `--volume` | string | Existing storage volume ID to attach | +| `--json` | -- | Output in JSON format | + +Examples: + +```shell +# On-demand Kubernetes cluster with H100s +together beta clusters create \ + --name my-training-cluster \ + --num-gpus 8 \ + --gpu-type H100_SXM \ + --region us-central-8 \ + --driver-version CUDA_12_6_560 \ + --billing-type ON_DEMAND \ + --cluster-type KUBERNETES + +# Reserved Slurm cluster with H200s and attached storage +together beta clusters create \ + --name my-slurm-cluster \ + --num-gpus 16 \ + --gpu-type H200_SXM \ + --region us-central-8 \ + --driver-version 
CUDA_12_6_560 \ + --billing-type RESERVED \ + --duration-days 30 \ + --cluster-type SLURM \ + --volume <volume-id> +``` + +Equivalent tcloud command: +```shell +tcloud cluster create my-training-cluster \ + --num-gpus 8 \ + --instance-type H100-SXM \ + --region us-central-8 \ + --billing-type on_demand \ + --shared-volume-name my-volume \ + --size-tib 1 +``` + +### `clusters list` + +List all GPU clusters. + +```shell +together beta clusters list +``` + +### `clusters retrieve` + +Get details for a specific cluster. + +```shell +together beta clusters retrieve <cluster-id> +``` + +### `clusters update` + +Update the configuration of an existing cluster (scale GPU count or change cluster type). + +```shell +together beta clusters update <cluster-id> [OPTIONS] +``` + +Options: + +| Flag | Type | Description | +|------|------|-------------| +| `--num-gpus` | number | New GPU count (must be a multiple of 8) | +| `--cluster-type` | enum | `KUBERNETES` or `SLURM` | +| `--json` | -- | Output in JSON format | + +Example: + +```shell +# Scale up to 16 GPUs +together beta clusters update <cluster-id> --num-gpus 16 + +# Switch to Slurm +together beta clusters update <cluster-id> --cluster-type SLURM +``` + +Equivalent tcloud command: +```shell +tcloud cluster scale <cluster-id> --num-gpus 16 +``` + +### `clusters delete` + +Delete a GPU cluster. + +```shell +together beta clusters delete <cluster-id> +``` + +Equivalent tcloud command: +```shell +tcloud cluster delete <cluster-id> +``` + +### `clusters list-regions` + +List available regions, supported GPU types, and driver versions. + +```shell +together beta clusters list-regions +``` + +Example output: + +```json +{ + "regions": [ + { + "driver_versions": [ + "CUDA_12_6_565", + "CUDA_12_5_555", + "CUDA_12_8_570", + "CUDA_12_9_575", + "CUDA_12_6_560", + "CUDA_12_4_550" + ], + "name": "us-central-8", + "supported_instance_types": [ + "H100_SXM", + "H200_SXM" + ] + } + ] +} +``` + +### `clusters get-credentials` + +Download Kubernetes credentials (kubeconfig) for a cluster.
+ +```shell +together beta clusters get-credentials [OPTIONS] +``` + +Options: + +| Flag | Type | Description | +|------|------|-------------| +| `--file` | path or `-` | Path to write kubeconfig. `-` prints to stdout. Default: `~/.kube/config` | +| `--context-name` | string | Name for the kubeconfig context. Default: cluster name | +| `--overwrite-existing` | -- | Overwrite existing kubeconfig entries on conflict | +| `--set-default-context` | -- | Set the new context as default for kubectl | + +Examples: + +```shell +# Write to default kubeconfig location +together beta clusters get-credentials + +# Write to a specific file +together beta clusters get-credentials --file ./kubeconfig.yaml + +# Print to stdout +together beta clusters get-credentials --file - + +# Overwrite and set as default +together beta clusters get-credentials \ + --overwrite-existing \ + --set-default-context + +# Use the cluster +export KUBECONFIG=~/.kube/config +kubectl get nodes +``` + +## Storage Commands + +Shared storage volumes are persistent, resizable, high-throughput storage backed by multi-NIC +bare metal paths. Volumes persist independently of cluster lifecycle. + +### `clusters storage create` + +Create a new shared storage volume. + +```shell +together beta clusters storage create [OPTIONS] +``` + +Options: + +| Flag | Type | Description | +|------|------|-------------| +| `--volume-name` | string | Name of the storage volume (required) | +| `--size-tib` | number | Size in tebibytes (required) | +| `--region` | string | Region to create the volume in (required) | +| `--json` | -- | Output in JSON format | + +Example: + +```shell +together beta clusters storage create \ + --volume-name my-training-data \ + --size-tib 2 \ + --region us-central-8 +``` + +### `clusters storage list` + +List all shared storage volumes. + +```shell +together beta clusters storage list +``` + +### `clusters storage retrieve` + +Get details for a specific volume. 
+ +```shell +together beta clusters storage retrieve +``` + +### `clusters storage delete` + +Delete a shared storage volume. The volume must not be attached to any cluster. + +```shell +together beta clusters storage delete +``` + +## Instance Types + +| CLI Value | GPU | Memory | Notes | +|-----------|-----|--------|-------| +| `H100_SXM` | NVIDIA H100 | 80GB | InfiniBand networking | +| `H100_SXM_INF` | NVIDIA H100 | 80GB | Inference-optimized, lower IB bandwidth | +| `H200_SXM` | NVIDIA H200 | 141GB | InfiniBand networking | +| `B200_SXM` | NVIDIA B200 | 192GB | InfiniBand networking | +| `L40_PCIE` | NVIDIA L40 | 48GB | PCIe | +| `RTX_6000_PCI` | NVIDIA RTX 6000 | 24GB | PCIe | + +## Driver Versions + +Available CUDA driver versions (check `clusters list-regions` for per-region availability): + +- `CUDA_12_4_550` +- `CUDA_12_5_555` +- `CUDA_12_6_560` +- `CUDA_12_6_565` +- `CUDA_12_8_570` +- `CUDA_12_9_575` diff --git a/plugins/togetherai/skills/together-gpu-clusters/scripts/manage_cluster.py b/plugins/togetherai/skills/together-gpu-clusters/scripts/manage_cluster.py new file mode 100644 index 00000000..aad39ad7 --- /dev/null +++ b/plugins/togetherai/skills/together-gpu-clusters/scripts/manage_cluster.py @@ -0,0 +1,255 @@ +#!/usr/bin/env python3 +""" +Together AI GPU Clusters -- Create, Monitor, Scale, Delete (v2 SDK) + +Full lifecycle: list regions, create cluster, wait for ready, +check status, scale, then delete. 
+ +Usage: + python manage_cluster.py list-regions + python manage_cluster.py create --name my-cluster --region us-central-8 --gpu-type H100_SXM --num-gpus 8 --driver-version CUDA_12_6_560 --i-understand-costs + python manage_cluster.py demo --i-understand-costs --delete + +Requires: + uv pip install "together>=2.0.0" + export TOGETHER_API_KEY=your_key +""" + +import argparse +import time + +from together import Together + +client = Together() + + +def list_regions(): + """List available regions with supported GPUs and drivers.""" + regions = client.beta.clusters.list_regions() + for region in regions.regions: + print(f" {region.name}: GPUs={region.supported_instance_types}, Drivers={region.driver_versions}") + return regions + + +def list_clusters() -> list: + """List all GPU clusters.""" + response = client.beta.clusters.list() + for cluster in response.clusters: + print(f" {cluster.cluster_id}: {cluster.cluster_name} ({cluster.status}, {cluster.num_gpus} GPUs, {cluster.gpu_type})") + return response.clusters + + +def parse_driver_version(driver_version: str) -> tuple[str, str]: + """Extract cuda_version and nvidia_driver_version from a combined string. + + Example: "CUDA_12_6_560" -> ("12.6", "560") + """ + parts = driver_version.removeprefix("CUDA_").split("_") + cuda_version = f"{parts[0]}.{parts[1]}" + nvidia_driver = parts[2] + return cuda_version, nvidia_driver + + +def create_cluster( + name: str, + region: str, + gpu_type: str, + num_gpus: int, + driver_version: str, + billing_type: str = "ON_DEMAND", + cluster_type: str = "KUBERNETES", + volume_id: str | None = None, + shared_volume_name: str | None = None, + shared_volume_size_tib: int | None = None, +): + """Create a new GPU cluster. + + For shared storage, prefer ``shared_volume_name`` + ``shared_volume_size_tib`` + (inline creation) over ``volume_id`` to avoid datacenter-mismatch errors. 
+ """ + cuda_ver, nvidia_ver = parse_driver_version(driver_version) + kwargs: dict = { + "cluster_name": name, + "region": region, + "gpu_type": gpu_type, + "num_gpus": num_gpus, + "driver_version": driver_version, + "billing_type": billing_type, + "cluster_type": cluster_type, + "extra_body": { + "cuda_version": cuda_ver, + "nvidia_driver_version": nvidia_ver, + }, + } + if volume_id: + kwargs["volume_id"] = volume_id + elif shared_volume_name and shared_volume_size_tib: + kwargs["shared_volume"] = { + "volume_name": shared_volume_name, + "size_tib": shared_volume_size_tib, + "region": region, + } + + cluster = client.beta.clusters.create(**kwargs) + print(f"Created cluster: {cluster.cluster_id} (status: {cluster.status})") + return cluster + + +def wait_for_ready(cluster_id: str, timeout: int = 1800, poll_interval: int = 30): + """Poll until cluster reaches Ready state.""" + elapsed = 0 + while elapsed < timeout: + cluster = client.beta.clusters.retrieve(cluster_id) + print(f" Status: {cluster.status} ({elapsed}s)") + + if cluster.status == "Ready": + return cluster + if cluster.status in ("Deleting",): + raise RuntimeError(f"Cluster is being deleted: {cluster_id}") + + time.sleep(poll_interval) + elapsed += poll_interval + + raise TimeoutError(f"Cluster not ready after {timeout}s") + + +def scale_cluster(cluster_id: str, num_gpus: int): + """Scale a cluster to a new GPU count.""" + cluster = client.beta.clusters.update(cluster_id, num_gpus=num_gpus) + print(f"Scaled cluster {cluster_id} to {num_gpus} GPUs (status: {cluster.status})") + return cluster + + +def delete_cluster(cluster_id: str) -> None: + """Delete a GPU cluster.""" + client.beta.clusters.delete(cluster_id) + print(f"Deleted cluster: {cluster_id}") + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Together AI GPU cluster management") + subparsers = parser.add_subparsers(dest="command", required=True) + + subparsers.add_parser("list-regions", help="List 
available regions") + subparsers.add_parser("list", help="List existing clusters") + + create_parser = subparsers.add_parser("create", help="Create a cluster") + create_parser.add_argument("--name", required=True, help="Cluster name") + create_parser.add_argument("--region", required=True, help="Region name") + create_parser.add_argument("--gpu-type", required=True, help="GPU type") + create_parser.add_argument("--num-gpus", required=True, type=int, help="GPU count") + create_parser.add_argument("--driver-version", required=True, help="Driver version") + create_parser.add_argument("--billing-type", default="ON_DEMAND", help="Billing type") + create_parser.add_argument("--cluster-type", default="KUBERNETES", help="Cluster type") + create_parser.add_argument("--volume-id", help="Optional existing volume id (prefer --shared-volume-name)") + create_parser.add_argument("--shared-volume-name", help="Inline shared volume name (created with cluster)") + create_parser.add_argument("--shared-volume-size-tib", type=int, help="Inline shared volume size in TiB") + create_parser.add_argument( + "--i-understand-costs", + action="store_true", + help="Required acknowledgement before creating billable GPU infrastructure", + ) + + wait_parser = subparsers.add_parser("wait", help="Wait for a cluster to become ready") + wait_parser.add_argument("--cluster-id", required=True, help="Cluster id") + wait_parser.add_argument("--timeout", type=int, default=1800, help="Maximum wait time in seconds") + wait_parser.add_argument("--poll-interval", type=int, default=30, help="Seconds between polls") + + scale_parser = subparsers.add_parser("scale", help="Scale an existing cluster") + scale_parser.add_argument("--cluster-id", required=True, help="Cluster id") + scale_parser.add_argument("--num-gpus", required=True, type=int, help="New GPU count") + + delete_parser = subparsers.add_parser("delete", help="Delete a cluster") + delete_parser.add_argument("--cluster-id", required=True, help="Cluster 
id") + + demo_parser = subparsers.add_parser("demo", help="Run the full example flow") + demo_parser.add_argument("--name", default="my-training-cluster", help="Cluster name") + demo_parser.add_argument("--region", default="us-central-8", help="Region name") + demo_parser.add_argument("--gpu-type", default="H100_SXM", help="GPU type") + demo_parser.add_argument("--num-gpus", type=int, default=8, help="Initial GPU count") + demo_parser.add_argument("--driver-version", default="CUDA_12_6_560", help="Driver version") + demo_parser.add_argument("--billing-type", default="ON_DEMAND", help="Billing type") + demo_parser.add_argument("--cluster-type", default="KUBERNETES", help="Cluster type") + demo_parser.add_argument("--volume-id", help="Optional existing volume id (prefer --shared-volume-name)") + demo_parser.add_argument("--shared-volume-name", help="Inline shared volume name (created with cluster)") + demo_parser.add_argument("--shared-volume-size-tib", type=int, help="Inline shared volume size in TiB") + demo_parser.add_argument("--scale-to", type=int, help="Optional GPU count to scale to after creation") + demo_parser.add_argument("--timeout", type=int, default=1800, help="Maximum wait time in seconds") + demo_parser.add_argument("--poll-interval", type=int, default=30, help="Seconds between polls") + demo_parser.add_argument("--delete", action="store_true", help="Delete the cluster at the end") + demo_parser.add_argument( + "--i-understand-costs", + action="store_true", + help="Required acknowledgement before creating billable GPU infrastructure", + ) + return parser.parse_args() + + +def main() -> None: + args = parse_args() + if args.command == "list-regions": + list_regions() + return + if args.command == "list": + list_clusters() + return + if args.command == "create": + if not args.i_understand_costs: + raise SystemExit("Creating GPU clusters is billable. 
Pass --i-understand-costs after reviewing the target region, GPU count, and cleanup plan.") + create_cluster( + name=args.name, + region=args.region, + gpu_type=args.gpu_type, + num_gpus=args.num_gpus, + driver_version=args.driver_version, + billing_type=args.billing_type, + cluster_type=args.cluster_type, + volume_id=args.volume_id, + shared_volume_name=args.shared_volume_name, + shared_volume_size_tib=args.shared_volume_size_tib, + ) + return + if args.command == "wait": + wait_for_ready(args.cluster_id, timeout=args.timeout, poll_interval=args.poll_interval) + return + if args.command == "scale": + scale_cluster(args.cluster_id, args.num_gpus) + return + if args.command == "delete": + delete_cluster(args.cluster_id) + return + + if not args.i_understand_costs: + raise SystemExit("The demo creates billable GPU infrastructure. Pass --i-understand-costs after reviewing the defaults and cleanup plan.") + if not args.delete: + print("Warning: demo will leave the cluster running. Pass --delete to clean it up at the end.") + + print("Available regions:") + list_regions() + print("\nExisting clusters:") + list_clusters() + cluster = create_cluster( + name=args.name, + region=args.region, + gpu_type=args.gpu_type, + num_gpus=args.num_gpus, + driver_version=args.driver_version, + billing_type=args.billing_type, + cluster_type=args.cluster_type, + volume_id=args.volume_id, + shared_volume_name=args.shared_volume_name, + shared_volume_size_tib=args.shared_volume_size_tib, + ) + print("\nWaiting for cluster to be ready...") + cluster = wait_for_ready(cluster.cluster_id, timeout=args.timeout, poll_interval=args.poll_interval) + print(f"Cluster ready: {cluster.cluster_name}") + if args.scale_to: + print(f"\nScaling to {args.scale_to} GPUs...") + scale_cluster(cluster.cluster_id, args.scale_to) + wait_for_ready(cluster.cluster_id, timeout=args.timeout, poll_interval=args.poll_interval) + if args.delete: + delete_cluster(cluster.cluster_id) + + +if __name__ == "__main__": + 
type GPUType = ClusterCreateParams["gpu_type"];
type DriverVersion = ClusterCreateParams["driver_version"];
type BillingType = ClusterCreateParams["billing_type"];
type ClusterType = NonNullable<ClusterCreateParams["cluster_type"]>;

/** Print every available region together with the GPU instance types it supports. */
async function listRegions(): Promise<void> {
  console.log("=== Available Regions ===");
  const regions = await client.beta.clusters.listRegions();
  for (const r of regions.regions) {
    console.log(` ${r.name}: GPUs=${JSON.stringify(r.supported_instance_types)}`);
  }
}

/** Print one summary line per existing cluster. */
async function listClusters(): Promise<void> {
  console.log("\n=== Existing Clusters ===");
  const response = await client.beta.clusters.list();
  for (const c of response.clusters) {
    console.log(` ${c.cluster_id}: ${c.cluster_name} (${c.status}, ${c.num_gpus} GPUs)`);
  }
}

/**
 * Split a combined driver string into its CUDA and NVIDIA driver parts.
 *
 * Example: "CUDA_12_6_560" -> { cudaVersion: "12.6", nvidiaDriver: "560" }
 *
 * @throws Error when the string lacks a major, minor, or driver component.
 */
function parseDriverVersion(driverVersion: string): { cudaVersion: string; nvidiaDriver: string } {
  // Anchor the prefix so only a leading "CUDA_" is removed.
  const [major, minor, nvidiaDriver] = driverVersion.replace(/^CUDA_/, "").split("_");
  if (!major || !minor || !nvidiaDriver) {
    throw new Error(`Invalid driver version "${driverVersion}"; expected a value like "CUDA_12_6_560"`);
  }
  return { cudaVersion: `${major}.${minor}`, nvidiaDriver };
}

/**
 * Create a GPU cluster and return the SDK cluster object.
 *
 * The split CUDA / NVIDIA driver versions are sent in addition to the
 * combined `driver_version`. The return type is inferred from the SDK call.
 */
async function createCluster(
  name: string,
  region: string,
  gpuType: GPUType,
  numGpus: number,
  driverVersion: DriverVersion,
  billingType: BillingType = "ON_DEMAND",
  clusterType: ClusterType = "KUBERNETES",
) {
  const { cudaVersion, nvidiaDriver } = parseDriverVersion(driverVersion);
  const cluster = await client.beta.clusters.create({
    cluster_name: name,
    region,
    gpu_type: gpuType,
    num_gpus: numGpus,
    driver_version: driverVersion,
    billing_type: billingType,
    cluster_type: clusterType,
    // @ts-expect-error -- required by API but not yet in SDK types
    cuda_version: cudaVersion,
    nvidia_driver_version: nvidiaDriver,
  });
  console.log(`Created cluster: ${cluster.cluster_id} (status: ${cluster.status})`);
  return cluster;
}

/**
 * Poll until the cluster reports status "Ready" and return it.
 *
 * @throws Error when the cluster reports "Deleting" or the timeout elapses.
 */
async function waitForReady(
  clusterId: string,
  timeoutMs: number = 1_800_000,
  pollMs: number = 30_000,
) {
  const start = Date.now();
  while (Date.now() - start < timeoutMs) {
    const cluster = await client.beta.clusters.retrieve(clusterId);
    const elapsed = Math.round((Date.now() - start) / 1000);
    console.log(` Status: ${cluster.status} (${elapsed}s)`);

    if (cluster.status === "Ready") return cluster;
    if (cluster.status === "Deleting") {
      throw new Error(`Cluster is being deleted: ${clusterId}`);
    }

    await new Promise((r) => setTimeout(r, pollMs));
  }
  throw new Error(`Cluster not ready after ${timeoutMs / 1000}s`);
}

/** Scale a cluster to a new GPU count and return the updated cluster. */
async function scaleCluster(clusterId: string, numGpus: number) {
  const cluster = await client.beta.clusters.update(clusterId, {
    num_gpus: numGpus,
  });
  console.log(`Scaled to ${numGpus} GPUs (status: ${cluster.status})`);
  return cluster;
}

/** Delete a cluster. */
async function deleteCluster(clusterId: string): Promise<void> {
  await client.beta.clusters.delete(clusterId);
  console.log(`Deleted cluster: ${clusterId}`);
}

/**
 * Full example flow: list regions and clusters, create a cluster, wait for
 * it, scale it to 16 GPUs, and wait again. Deletion is left commented out,
 * so the cluster keeps running after the script ends.
 */
async function main(): Promise<void> {
  const CLUSTER_NAME = "my-training-cluster";
  const REGION = "us-central-8";
  const GPU_TYPE: GPUType = "H100_SXM";
  const NUM_GPUS = 8;
  const DRIVER: DriverVersion = "CUDA_12_6_560";

  // 1. List available regions
  await listRegions();

  // 2. List existing clusters
  await listClusters();

  // 3. Create a cluster
  const cluster = await createCluster(
    CLUSTER_NAME, REGION, GPU_TYPE, NUM_GPUS, DRIVER,
  );

  // 4. Wait for cluster to be ready
  console.log("\nWaiting for cluster to be ready...");
  const ready = await waitForReady(cluster.cluster_id);
  console.log(`Cluster ready: ${ready.cluster_name}`);

  // 5. Scale up to 16 GPUs
  console.log("\nScaling to 16 GPUs...");
  await scaleCluster(cluster.cluster_id, 16);

  // 6. Wait for scaling to complete
  await waitForReady(cluster.cluster_id);

  // 7. Delete when done (uncomment to delete)
  // await deleteCluster(cluster.cluster_id);
}

// Report failures and exit non-zero instead of leaving the rejection unhandled.
main().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});
def create_volume(name: str, size_tib: int, region: str):
    """Create a new shared storage volume and print a one-line summary.

    Args:
        name: Volume name, sent to the API as ``volume_name``.
        size_tib: Requested capacity in TiB.
        region: Region in which to create the volume.

    Returns:
        The volume object returned by ``client.beta.clusters.storage.create``.
    """
    volume = client.beta.clusters.storage.create(
        volume_name=name,
        size_tib=size_tib,
        region=region,
    )
    print(f"Created volume: {volume.volume_id} ({volume.volume_name}, {volume.size_tib} TiB, {volume.status})")
    return volume


def list_volumes() -> list:
    """Print every shared storage volume and return the list of volumes."""
    response = client.beta.clusters.storage.list()
    for volume in response.volumes:
        print(f" {volume.volume_id}: {volume.volume_name} ({volume.size_tib} TiB, {volume.status})")
    return response.volumes


def retrieve_volume(volume_id: str):
    """Fetch one volume, print its name, id, size and status, and return it."""
    volume = client.beta.clusters.storage.retrieve(volume_id)
    print(f"Volume: {volume.volume_name}")
    print(f" ID: {volume.volume_id}")
    print(f" Size: {volume.size_tib} TiB")
    print(f" Status: {volume.status}")
    return volume


def resize_volume(volume_id: str, new_size_tib: int):
    """Resize a shared storage volume.

    Args:
        volume_id: Identifier of the volume to resize.
        new_size_tib: New capacity in TiB, sent to the API as ``size_tib``.

    Returns:
        The volume object returned by ``client.beta.clusters.storage.update``.
    """
    volume = client.beta.clusters.storage.update(
        volume_id=volume_id,
        size_tib=new_size_tib,
    )
    print(f"Resized volume {volume_id} to {volume.size_tib} TiB")
    return volume


def delete_volume(volume_id: str) -> None:
    """Delete a shared storage volume and print its id."""
    client.beta.clusters.storage.delete(volume_id)
    print(f"Deleted volume: {volume_id}")


def _positive_int(value: str) -> int:
    """Parse *value* as an integer >= 1 for use as an argparse ``type=``."""
    try:
        number = int(value)
    except ValueError:
        raise argparse.ArgumentTypeError(f"invalid int value: {value!r}") from None
    if number < 1:
        raise argparse.ArgumentTypeError(f"must be a positive integer, got {number}")
    return number


def parse_args(argv: "list[str] | None" = None) -> argparse.Namespace:
    """Parse the command line for the storage management script.

    Sub-commands: ``list``, ``create``, ``get``, ``resize``, ``delete`` and
    ``demo``. Every size option must be a positive integer, so a zero or
    negative size is rejected here instead of being sent to the API.

    Args:
        argv: Argument list to parse. ``None`` (the default) makes argparse
            read ``sys.argv[1:]``, which is the original behaviour.

    Returns:
        The parsed namespace; ``command`` holds the chosen sub-command.
    """
    parser = argparse.ArgumentParser(description="Together AI shared storage management")
    subparsers = parser.add_subparsers(dest="command", required=True)

    subparsers.add_parser("list", help="List volumes")

    create_parser = subparsers.add_parser("create", help="Create a volume")
    create_parser.add_argument("--name", required=True, help="Volume name")
    create_parser.add_argument("--size-tib", required=True, type=_positive_int, help="Volume size in TiB")
    create_parser.add_argument("--region", required=True, help="Region name")

    retrieve_parser = subparsers.add_parser("get", help="Retrieve a volume")
    retrieve_parser.add_argument("--volume-id", required=True, help="Volume id")

    resize_parser = subparsers.add_parser("resize", help="Resize a volume")
    resize_parser.add_argument("--volume-id", required=True, help="Volume id")
    resize_parser.add_argument("--size-tib", required=True, type=_positive_int, help="New size in TiB")

    delete_parser = subparsers.add_parser("delete", help="Delete a volume")
    delete_parser.add_argument("--volume-id", required=True, help="Volume id")

    demo_parser = subparsers.add_parser("demo", help="Run the full example flow")
    demo_parser.add_argument("--name", default="my-training-data", help="Volume name")
    demo_parser.add_argument("--region", default="us-central-8", help="Region name")
    demo_parser.add_argument("--size-tib", type=_positive_int, default=2, help="Initial size in TiB")
    demo_parser.add_argument("--resize-to", type=_positive_int, default=5, help="New size in TiB")
    demo_parser.add_argument("--delete", action="store_true", help="Delete the volume at the end")
    return parser.parse_args(argv)


def main() -> None:
    """Run the sub-command selected on the command line.

    The five single-step commands call one helper and return. Any other
    command (``demo``) runs the full flow: create, list, retrieve, resize and,
    with ``--delete``, delete the volume again.
    """
    args = parse_args()

    # Lambdas defer attribute access so each one only touches the options
    # that its own sub-parser defines.
    single_step = {
        "list": lambda: list_volumes(),
        "create": lambda: create_volume(args.name, args.size_tib, args.region),
        "get": lambda: retrieve_volume(args.volume_id),
        "resize": lambda: resize_volume(args.volume_id, args.size_tib),
        "delete": lambda: delete_volume(args.volume_id),
    }
    action = single_step.get(args.command)
    if action is not None:
        action()
        return

    volume = create_volume(args.name, args.size_tib, args.region)
    print("\nAll volumes:")
    list_volumes()
    print("\nVolume details:")
    retrieve_volume(volume.volume_id)
    print(f"\nResizing to {args.resize_to} TiB...")
    resize_volume(volume.volume_id, args.resize_to)
    if args.delete:
        delete_volume(volume.volume_id)


if __name__ == "__main__":
    main()
[references/api-reference.md](references/api-reference.md) +- Image editing with Kontext + - Start with [scripts/kontext_editing.py](scripts/kontext_editing.py) + - Read [references/api-reference.md](references/api-reference.md) +- Generate then edit (e.g. product photos) + - Start with [scripts/kontext_editing.py](scripts/kontext_editing.py) (Example 7) + - Generate with FLUX, feed the URL to Kontext, save both locally +- LoRA styling + - Start with [scripts/lora_generation.py](scripts/lora_generation.py) + - Read [references/api-reference.md](references/api-reference.md) +- Model and dimension selection + - Read [references/models.md](references/models.md) + +## Workflow + +1. Confirm whether the task is generation, editing, or style transfer. +2. Choose the model family and output dimensions first. +3. Add reference images, LoRAs, or FLUX.2-only parameters only when the use case needs them. +4. Generate the asset, then download or decode it into the expected local format. + +## High-Signal Rules + +- Python scripts require the Together v2 SDK (`together>=2.0.0`). If the user is on an older version, they must upgrade first: `uv pip install --upgrade "together>=2.0.0"`. +- Match the script to the workflow type instead of packing every image feature into one request path. +- Keep model selection explicit because FLUX, Kontext, and partner models differ in capabilities. +- Preserve reproducibility with seeds when the user needs stable outputs. +- For editing or reference-image flows, validate that the chosen model actually supports the feature. 
+ +## Resource Map + +- API reference: [references/api-reference.md](references/api-reference.md) +- Troubleshooting and generation tuning: [references/api-reference.md](references/api-reference.md) +- Model guide: [references/models.md](references/models.md) +- Python image generation: [scripts/generate_image.py](scripts/generate_image.py) +- TypeScript image generation: [scripts/generate_image.ts](scripts/generate_image.ts) +- Python Kontext editing: [scripts/kontext_editing.py](scripts/kontext_editing.py) +- Python LoRA generation: [scripts/lora_generation.py](scripts/lora_generation.py) + +## Official Docs + +- [Images Overview](https://docs.together.ai/docs/images-overview) +- [FLUX.2 Quickstart](https://docs.together.ai/docs/quickstart-flux) +- [FLUX Kontext](https://docs.together.ai/docs/quickstart-flux-kontext) +- [FLUX LoRA](https://docs.together.ai/docs/quickstart-flux-lora) +- [Image Generation API](https://docs.together.ai/reference/post-images-generations) diff --git a/plugins/togetherai/skills/together-images/references/api-reference.md b/plugins/togetherai/skills/together-images/references/api-reference.md new file mode 100644 index 00000000..f86480fe --- /dev/null +++ b/plugins/togetherai/skills/together-images/references/api-reference.md @@ -0,0 +1,368 @@ +# Image Generation API Reference +## Contents + +- [Endpoint](#endpoint) +- [Parameters](#parameters) +- [Text-to-Image](#text-to-image) +- [Multiple Variations](#multiple-variations) +- [FLUX.2 Generation](#flux2-generation) +- [Image Editing (Kontext)](#image-editing-kontext) +- [Reference Images (FLUX.2, Google)](#reference-images-flux2-google) +- [LoRA Adapters](#lora-adapters) +- [Response](#response) +- [Steps Guide](#steps-guide) +- [Dimensions Guide](#dimensions-guide) +- [Model Feature Matrix](#model-feature-matrix) +- [Troubleshooting](#troubleshooting) + + +## Endpoint + +`POST https://api.together.xyz/v1/images/generations` + +## Parameters + +| Parameter | Type | Required | Default | Description |
+|-----------|------|----------|---------|-------------| +| `prompt` | string | Yes | - | Text description of image to generate | +| `model` | string | Yes | - | Model identifier | +| `width` | integer | No | 1024 | Image width in pixels (256-1920) | +| `height` | integer | No | 1024 | Image height in pixels (256-1920) | +| `n` | integer | No | 1 | Number of images (1-4) | +| `steps` | integer | No | varies | Diffusion steps (1-50) | +| `seed` | integer | No | random | Random seed for reproducibility | +| `negative_prompt` | string | No | - | What to avoid in generation | +| `response_format` | string | No | `"url"` | `"base64"` for inline data, `"url"` for hosted | +| `image_url` | string | No | - | Reference image URL (Kontext models) | +| `reference_images` | array | No | - | Reference image URLs (FLUX.2, Google models) | +| `image_loras` | array | No | - | LoRA adapters: `[{path, scale}]` (max 2) | +| `guidance` | float | No | - | Guidance scale for FLUX.2 dev/flex | +| `guidance_scale` | number | No | 3.5 | Prompt alignment (1-5 creative, 8-10 faithful) | +| `prompt_upsampling` | bool | No | true | Auto-enhance prompts (FLUX.2) | +| `output_format` | string | No | `"jpeg"` | `"jpeg"` or `"png"` (FLUX.2) | +| `aspect_ratio` | string | No | - | For Schnell/Kontext: 1:1, 16:9, 9:16, 4:3, 3:2 | +| `disable_safety_checker` | bool | No | false | Disable NSFW check | + +## Text-to-Image + +```python +from together import Together +client = Together() + +response = client.images.generate( + prompt="A sunset over mountains", + model="black-forest-labs/FLUX.2-dev", + width=1024, + height=1024, + steps=20, + n=1, +) +print(response.data[0].url) +``` + +```typescript +import Together from "together-ai"; +const together = new Together(); + +const response = await together.images.generate({ + prompt: "A sunset over mountains", + model: "black-forest-labs/FLUX.2-dev", + width: 1024, + height: 1024, + steps: 20, + n: 1, +}); +console.log(response.data[0].url); +``` + 
+```shell +curl -X POST "https://api.together.xyz/v1/images/generations" \ + -H "Authorization: Bearer $TOGETHER_API_KEY" \ + -H "Content-Type: application/json" \ + -d '{ + "prompt": "A sunset over mountains", + "model": "black-forest-labs/FLUX.2-dev", + "width": 1024, + "height": 1024, + "steps": 20, + "n": 1 + }' +``` + +## Multiple Variations + +Use `n` to request multiple candidate images from the same prompt in one call: + +```python +response = client.images.generate( + prompt="A cozy reading nook with warm afternoon light", + model="black-forest-labs/FLUX.2-dev", + width=1024, + height=1024, + steps=20, + n=4, +) + +for image in response.data: + print(image.url) +``` + +```typescript +const response = await together.images.generate({ + prompt: "A cozy reading nook with warm afternoon light", + model: "black-forest-labs/FLUX.2-dev", + width: 1024, + height: 1024, + steps: 20, + n: 4, +}); + +for (const image of response.data) { + console.log(image.url); +} +``` + +## FLUX.2 Generation + +FLUX.2 models support `prompt_upsampling`, `output_format`, `guidance`, and +`reference_images`. 
+ +```python +response = client.images.generate( + model="black-forest-labs/FLUX.2-pro", + prompt="A mountain landscape at sunset with golden light", + width=1024, + height=768, + prompt_upsampling=True, + output_format="png", +) +``` + +```typescript +const response = await together.images.generate({ + model: "black-forest-labs/FLUX.2-pro", + prompt: "A mountain landscape at sunset with golden light", + width: 1024, + height: 768, + prompt_upsampling: true, + output_format: "png", +}); +``` + +### FLUX.2 Dev/Flex with Guidance + +```python +response = client.images.generate( + model="black-forest-labs/FLUX.2-dev", + prompt="A detailed portrait in oil painting style", + width=1024, + height=1024, + steps=28, + guidance=7.5, +) +``` + +## Image Editing (Kontext) + +For Kontext models, provide a reference image and editing instructions: + +```python +response = client.images.generate( + model="black-forest-labs/FLUX.1-kontext-pro", + prompt="Make his shirt yellow", + image_url="https://github.com/nutlope.png", + width=1536, + height=1024, + steps=28, +) +``` + +```typescript +const response = await together.images.generate({ + model: "black-forest-labs/FLUX.1-kontext-pro", + prompt: "Make his shirt yellow", + image_url: "https://github.com/nutlope.png", + width: 1536, + height: 1024, + steps: 28, +}); +``` + +```shell +curl -X POST "https://api.together.xyz/v1/images/generations" \ + -H "Authorization: Bearer $TOGETHER_API_KEY" \ + -H "Content-Type: application/json" \ + -d '{ + "model": "black-forest-labs/FLUX.1-kontext-pro", + "prompt": "Make his shirt yellow", + "image_url": "https://github.com/nutlope.png", + "width": 1536, + "height": 1024, + "steps": 28 + }' +``` + +## Reference Images (FLUX.2, Google) + +Use `reference_images` for multi-image guidance: + +```python +response = client.images.generate( + model="black-forest-labs/FLUX.2-pro", + prompt="Replace the color of the car to blue", + width=1024, + height=768, + reference_images=[ + 
"https://images.pexels.com/photos/3729464/pexels-photo-3729464.jpeg", + ], +) +``` + +```typescript +const response = await together.images.generate({ + model: "black-forest-labs/FLUX.2-pro", + prompt: "Replace the color of the car to blue", + width: 1024, + height: 768, + reference_images: [ + "https://images.pexels.com/photos/3729464/pexels-photo-3729464.jpeg", + ], +}); +``` + +## LoRA Adapters + +Apply up to 2 LoRA adapters per image. Compatible with FLUX.2 Dev and FLUX.1 Dev. + +```python +response = client.images.generate( + model="black-forest-labs/FLUX.2-dev", + prompt="a man walking outside on a rainy day", + width=1024, + height=768, + steps=28, + image_loras=[ + {"path": "https://huggingface.co/XLabs-AI/flux-RealismLora", "scale": 0.8}, + ], +) +``` + +```typescript +const response = await together.images.generate({ + model: "black-forest-labs/FLUX.2-dev", + prompt: "a man walking outside on a rainy day", + width: 1024, + height: 768, + steps: 28, + image_loras: [ + { path: "https://huggingface.co/XLabs-AI/flux-RealismLora", scale: 0.8 }, + ], +}); +``` + +```shell +curl -X POST "https://api.together.xyz/v1/images/generations" \ + -H "Authorization: Bearer $TOGETHER_API_KEY" \ + -H "Content-Type: application/json" \ + -d '{ + "model": "black-forest-labs/FLUX.2-dev", + "prompt": "a man walking outside on a rainy day", + "width": 1024, + "height": 768, + "steps": 28, + "image_loras": [ + {"path": "https://huggingface.co/XLabs-AI/flux-RealismLora", "scale": 0.8} + ] + }' +``` + +### LoRA Path Formats + +- Hugging Face repo: `https://huggingface.co/XLabs-AI/flux-RealismLora` +- Hugging Face file: `https://huggingface.co/.../resolve/main/model.safetensors` +- CivitAI: `https://civitai.com/api/download/models/...` +- Replicate: `https://replicate.com/fofr/flux-black-light` +- Direct `.safetensors` URL + +### LoRA Scale Guide + +- `0.3-0.5`: Subtle effect +- `0.6-0.8`: Balanced (recommended) +- `0.9-1.2`: Strong effect + +## Response + +```json +{ + "id": 
"img-abc123", + "model": "black-forest-labs/FLUX.2-dev", + "object": "list", + "data": [ + { + "index": 0, + "url": "https://api.together.ai/v1/images/...", + "type": "url" + } + ] +} +``` + +With `response_format="base64"`: + +```json +{ + "id": "img-abc123", + "model": "black-forest-labs/FLUX.2-dev", + "object": "list", + "data": [ + { + "index": 0, + "b64_json": "iVBORw0KGgo...", + "type": "b64_json", + "timings": { "inference": 0.799 } + } + ] +} +``` + +## Steps Guide + +| Steps | Effect | +|-------|--------| +| 1-4 | Fast, lower quality (FLUX.1 Schnell default: 4) | +| 10-20 | Good balance of speed and quality | +| 28 | High quality (Kontext, FLUX.1 Dev default) | +| 30-50 | Maximum quality, slower | + +## Dimensions Guide + +| Aspect Ratio | Dimensions | Use Case | +|-------------|-----------|----------| +| 1:1 | 1024x1024 | Square, social media | +| 16:9 | 1344x768 | Landscape, widescreen | +| 9:16 | 768x1344 | Portrait, mobile | +| 3:2 | 1248x832 | Photography standard | +| 4:3 | 1184x864 | Classic ratio | + +## Model Feature Matrix + +| Feature | FLUX.2 | FLUX.1 Schnell | FLUX.1 Kontext | Google | +|---------|--------|---------------|---------------|--------| +| Text-to-image | Yes | Yes | Yes | Yes | +| `image_url` | Pro/Flex | No | Yes | No | +| `reference_images` | Yes | No | No | Yes | +| `image_loras` | Dev | No | No | No | +| `prompt_upsampling` | Yes | No | No | No | +| `guidance` | Dev/Flex | No | No | No | +| `output_format` | Yes | No | No | No | +| `negative_prompt` | No | Yes | No | No | + +## Troubleshooting + +| Issue | Solution | +|-------|----------| +| Prompt mismatch | Add descriptive language, style references, and increase steps | +| Poor quality | Use 30-40 steps and add quality modifiers such as "highly detailed" | +| Inconsistent results | Set `seed` for reproducibility | +| Wrong dimensions | Ensure width and height are multiples of 8 and use standard aspect ratios | +| LoRA not applying | Verify the `.safetensors` URL is 
accessible and adjust `scale` between 0.3 and 1.2 | diff --git a/plugins/togetherai/skills/together-images/references/models.md b/plugins/togetherai/skills/together-images/references/models.md new file mode 100644 index 00000000..47edea2e --- /dev/null +++ b/plugins/togetherai/skills/together-images/references/models.md @@ -0,0 +1,116 @@ +# Image Generation Models Reference +## Contents + +- [Complete Model Table](#complete-model-table) +- [Model Categories](#model-categories) +- [Recommended Models](#recommended-models) +- [FLUX.2 Model Comparison](#flux2-model-comparison) +- [Supported Dimensions](#supported-dimensions) +- [FLUX Pricing Formula](#flux-pricing-formula) + + +## Complete Model Table + +| Organization | Model | API String | Default Steps | +|-------------|-------|-----------|--------------| +| Google | Imagen 4.0 Preview | `google/imagen-4.0-preview` | - | +| Google | Imagen 4.0 Fast | `google/imagen-4.0-fast` | - | +| Google | Imagen 4.0 Ultra | `google/imagen-4.0-ultra` | - | +| Google | Flash Image 2.5 | `google/flash-image-2.5` | - | +| Google | Gemini 3 Pro Image | `google/gemini-3-pro-image` | - | +| Black Forest Labs | FLUX.2 [pro] | `black-forest-labs/FLUX.2-pro` | - | +| Black Forest Labs | FLUX.2 [dev] | `black-forest-labs/FLUX.2-dev` | - | +| Black Forest Labs | FLUX.2 [flex] | `black-forest-labs/FLUX.2-flex` | - | +| Black Forest Labs | FLUX.1 [schnell] | `black-forest-labs/FLUX.1-schnell` | 4 | +| Black Forest Labs | FLUX.1.1 [pro] | `black-forest-labs/FLUX.1.1-pro` | - | +| Black Forest Labs | FLUX.1 Kontext [pro] | `black-forest-labs/FLUX.1-kontext-pro` | 28 | +| Black Forest Labs | FLUX.1 Kontext [max] | `black-forest-labs/FLUX.1-kontext-max` | 28 | +| ByteDance | Seedream 4.0 | `ByteDance-Seed/Seedream-4.0` | - | +| ByteDance | Seedream 3.0 | `ByteDance-Seed/Seedream-3.0` | - | +| Qwen | Qwen Image | `Qwen/Qwen-Image` | - | +| Ideogram | Ideogram 3.0 | `ideogram/ideogram-3.0` | - | +| Ideogram | Ideogram 4.0 | `ideogram/ideogram-4.0` 
| - | +| HiDream | HiDream-I1-Full | `HiDream-ai/HiDream-I1-Full` | - | +| HiDream | HiDream-I1-Dev | `HiDream-ai/HiDream-I1-Dev` | - | +| HiDream | HiDream-I1-Fast | `HiDream-ai/HiDream-I1-Fast` | - | +| RunDiffusion | Juggernaut Pro Flux | `RunDiffusion/Juggernaut-pro-flux` | - | +| RunDiffusion | Juggernaut Lightning | `Rundiffusion/Juggernaut-Lightning-Flux` | - | +| Lykon | DreamShaper | `Lykon/DreamShaper` | - | +| Stability AI | SD 3 Medium | `stabilityai/stable-diffusion-3-medium` | - | +| Stability AI | SD 3 Medium | `stabilityai/stable-diffusion-3-medium` | - | + +## Model Categories + +### Text-to-Image (All models) + +All models above support text-to-image generation via the `prompt` parameter. + +### Image Editing (single reference via `image_url`) + +- `black-forest-labs/FLUX.1-kontext-pro` -- Balanced speed/quality (recommended) +- `black-forest-labs/FLUX.1-kontext-max` -- Maximum editing quality +- `black-forest-labs/FLUX.2-pro` -- FLUX.2 editing +- `black-forest-labs/FLUX.2-flex` -- Adjustable guidance + +### Multi-Image Guidance (via `reference_images`) + +- `black-forest-labs/FLUX.2-pro` +- `black-forest-labs/FLUX.2-dev` +- `black-forest-labs/FLUX.2-flex` +- `google/gemini-3-pro-image` +- `google/flash-image-2.5` + +### LoRA Compatible (via `image_loras`) + +- `black-forest-labs/FLUX.2-dev` -- Up to 2 LoRAs, scale 0.3-1.2 + +## Recommended Models + +| Use Case | Model | API String | +|----------|-------|-----------| +| Best quality | Flash Image 2.5 | `google/flash-image-2.5` | +| Highest quality FLUX | FLUX.2 Pro | `black-forest-labs/FLUX.2-pro` | +| Image editing | FLUX.1 Kontext Max | `black-forest-labs/FLUX.1-kontext-max` | +| Fast generation | FLUX.1 Schnell | `black-forest-labs/FLUX.1-schnell` | +| LoRA styles | FLUX.2 Dev | `black-forest-labs/FLUX.2-dev` | +| Typography | FLUX.2 Flex | `black-forest-labs/FLUX.2-flex` | +| Text in images | Ideogram 3.0 | `ideogram/ideogram-3.0` | +| Up to 4K output | Gemini 3 Pro Image | 
`google/gemini-3-pro-image` | + +## FLUX.2 Model Comparison + +| Variant | Best For | Unique Features | +|---------|----------|-----------------| +| Pro | Production, highest fidelity | Up to 9MP output, fastest | +| Dev | Development, LoRA support | `guidance`, `steps`, `image_loras` | +| Flex | Maximum control, typography | `guidance`, `steps`, adjustable | + +## Supported Dimensions + +### Standard (most models) + +- 1024x1024 (1:1), 1344x768 (16:9), 768x1344 (9:16) +- 1248x832 (3:2), 832x1248 (2:3) +- 1184x864 (4:3), 864x1184 (3:4) + +### Gemini 3 Pro Image -- 1K + +1024x1024, 1248x832, 832x1248, 1184x864, 864x1184, 896x1152, 1152x896, 768x1344, 1344x768, +1536x672 + +### Gemini 3 Pro Image -- 2K + +2048x2048, 2496x1664, 1664x2496, 2368x1728, 1728x2368, 1792x2304, 2304x1792, 1536x2688, +2688x1536, 3072x1344 + +### Gemini 3 Pro Image -- 4K + +4096x4096, 4992x3328, 3328x4992, 4736x3456, 3456x4736, 3584x4608, 4608x3584, 3072x5376, +5376x3072, 6144x2688 + +## FLUX Pricing Formula + +``` +Cost = MP x Price_per_MP x (Steps / Default_Steps) +MP = Width x Height / 1,000,000 +``` diff --git a/plugins/togetherai/skills/together-images/scripts/generate_image.py b/plugins/togetherai/skills/together-images/scripts/generate_image.py new file mode 100644 index 00000000..cccf9dfd --- /dev/null +++ b/plugins/togetherai/skills/together-images/scripts/generate_image.py @@ -0,0 +1,147 @@ +#!/usr/bin/env python3 +""" +Together AI Image Generation -- Text-to-Image and FLUX.2 (v2 SDK) + +Generate images from text prompts, save locally, create variations, +and use FLUX.2 reference images. 
def generate_image_url(
    prompt: str,
    model: str = "black-forest-labs/FLUX.2-dev",
    width: int = 1024,
    height: int = 1024,
    steps: int = 20,
    n: int = 1,
    seed: int | None = None,
) -> list[str]:
    """Request ``n`` images for *prompt*, print each URL, and return the URLs.

    ``seed`` is only included in the request when it is not ``None``.
    """
    request: dict = {
        "model": model,
        "prompt": prompt,
        "width": width,
        "height": height,
        "steps": steps,
        "n": n,
    }
    if seed is not None:
        request["seed"] = seed

    result = client.images.generate(**request)

    urls = []
    for image in result.data:
        urls.append(image.url)
    for index, image_url in enumerate(urls):
        print(f" Image {index}: {image_url}")
    return urls


def generate_and_save(
    prompt: str,
    output_path: str = "output.png",
    model: str = "black-forest-labs/FLUX.2-dev",
    width: int = 1024,
    height: int = 1024,
    steps: int = 20,
) -> str:
    """Generate a single image as base64, write it to *output_path*, return the path."""
    result = client.images.generate(
        model=model,
        prompt=prompt,
        width=width,
        height=height,
        steps=steps,
        n=1,
        response_format="base64",
    )
    first_image = result.data[0]
    image_data = base64.b64decode(first_image.b64_json)

    with open(output_path, "wb") as out_file:
        out_file.write(image_data)

    print(f" Saved to {output_path} ({len(image_data)} bytes)")
    return output_path


def generate_flux2(
    prompt: str,
    model: str = "black-forest-labs/FLUX.2-pro",
    width: int = 1024,
    height: int = 768,
    reference_images: list[str] | None = None,
    prompt_upsampling: bool = True,
    output_format: str = "png",
) -> str:
    """Generate one image using the FLUX.2 options and return its URL.

    ``reference_images`` is only sent when the list is non-empty.
    """
    request: dict = {
        "model": model,
        "prompt": prompt,
        "width": width,
        "height": height,
        "prompt_upsampling": prompt_upsampling,
        "output_format": output_format,
    }
    if reference_images:
        request["reference_images"] = reference_images

    result = client.images.generate(**request)
    image_url = result.data[0].url
    print(f" FLUX.2 image: {image_url}")
    return image_url


if __name__ == "__main__":
    # --- Example 1: Basic text-to-image ---
    print("=== Basic Generation ===")
    generate_image_url(
        prompt="A serene mountain landscape at sunset, digital art",
        steps=20,
    )

    # --- Example 2: Save locally ---
    print("\n=== Save to File ===")
    generate_and_save(
        prompt="A futuristic city skyline with flying cars",
        output_path="city.png",
        steps=20,
    )

    # --- Example 3: Multiple variations ---
    print("\n=== 3 Variations ===")
    generate_image_url(
        prompt="A cute robot reading a book",
        n=3,
        steps=20,
    )

    # --- Example 4: Reproducible with seed ---
    print("\n=== Reproducible (seed=42) ===")
    generate_image_url(
        prompt="Abstract geometric pattern in blue and gold",
        seed=42,
        steps=20,
    )

    # --- Example 5: FLUX.2 with prompt upsampling ---
    print("\n=== FLUX.2 Pro ===")
    generate_flux2(
        prompt="A mountain landscape at sunset with golden light reflecting on a calm lake",
    )

    # --- Example 6: FLUX.2 with reference image ---
    # print("\n=== FLUX.2 Reference Image ===")
    # generate_flux2(
    #     prompt="Replace the color of the car to blue",
    #     reference_images=["https://images.pexels.com/photos/3729464/pexels-photo-3729464.jpeg"],
    # )
/** Generate one image with FLUX.2 Dev and print its URL. */
async function basicGeneration(): Promise<void> {
  console.log("=== Basic Generation (FLUX.2 Dev) ===");
  const response = await client.images.generate({
    model: "black-forest-labs/FLUX.2-dev",
    prompt: "A serene mountain landscape at sunset with a lake reflection",
    steps: 20,
  });
  console.log(` Image URL: ${response.data[0].url}`);
}

/** Generate one PNG with FLUX.2 Pro, with prompt upsampling enabled, and print its URL. */
async function flux2Generation(): Promise<void> {
  console.log("\n=== FLUX.2 Pro ===");
  const response = await client.images.generate({
    model: "black-forest-labs/FLUX.2-pro",
    prompt: "A mountain landscape at sunset with golden light reflecting on a calm lake",
    width: 1024,
    height: 768,
    prompt_upsampling: true,
    output_format: "png",
  });
  console.log(` Image URL: ${response.data[0].url}`);
}

/** Edit a hosted source image with FLUX.1 Kontext Pro and print the result URL. */
async function kontextEditing(): Promise<void> {
  console.log("\n=== Kontext Image Editing ===");
  const response = await client.images.generate({
    model: "black-forest-labs/FLUX.1-kontext-pro",
    prompt: "Transform this into a watercolor painting",
    image_url: "https://cdn.pixabay.com/photo/2020/05/20/08/27/cat-5195431_1280.jpg",
    width: 1024,
    height: 768,
    steps: 28,
  });
  console.log(` Edited image: ${response.data[0].url}`);
}

/** Request four candidates for one prompt in a single call and print each URL. */
async function multipleVariations(): Promise<void> {
  console.log("\n=== Multiple Variations ===");
  const response = await client.images.generate({
    model: "black-forest-labs/FLUX.2-dev",
    prompt: "A cute robot assistant helping in a modern office",
    n: 4,
    steps: 20,
  });
  response.data.forEach((image, index) => {
    console.log(` Variation ${index + 1}: ${image.url}`);
  });
}

/** Request inline base64 data instead of a URL and print the payload length. */
async function base64Response(): Promise<void> {
  console.log("\n=== Base64 Response ===");
  const response = await client.images.generate({
    model: "black-forest-labs/FLUX.2-dev",
    prompt: "A cat in outer space",
    steps: 20,
    response_format: "base64",
  });
  // b64_json may be absent on the response type; fall back to an empty string.
  const data = response.data[0].b64_json ?? "";
  console.log(` Base64 length: ${data.length} chars`);
}

/** Run every example in sequence; each one issues its own API request. */
async function main(): Promise<void> {
  await basicGeneration();
  await flux2Generation();
  await kontextEditing();
  await multipleVariations();
  await base64Response();
}

// Surface failures explicitly instead of leaving a floating promise.
main().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});
def _edit_request(
    prompt: str,
    image_url: str,
    model: str,
    width: int,
    height: int,
    steps: int,
    seed: int | None,
    **extra,
) -> dict:
    """Build the keyword arguments shared by the Kontext editing calls."""
    request: dict = {
        "model": model,
        "prompt": prompt,
        "image_url": image_url,
        "width": width,
        "height": height,
        "steps": steps,
    }
    request.update(extra)
    if seed is not None:
        request["seed"] = seed
    return request


def edit_image(
    prompt: str,
    image_url: str,
    model: str = KONTEXT_PRO,
    width: int = 1024,
    height: int = 1024,
    steps: int = 28,
    seed: int | None = None,
) -> str:
    """Apply a text-guided edit to the image at *image_url* and return the result URL."""
    request = _edit_request(prompt, image_url, model, width, height, steps, seed)
    result = client.images.generate(**request)
    edited_url = result.data[0].url
    print(f" Edited image: {edited_url}")
    return edited_url


def edit_and_save(
    prompt: str,
    image_url: str,
    output_path: str = "edited.png",
    model: str = KONTEXT_PRO,
    width: int = 1024,
    height: int = 1024,
    steps: int = 28,
    seed: int | None = None,
) -> str:
    """Edit an image, request it as base64, write it to *output_path*, return the path."""
    request = _edit_request(
        prompt, image_url, model, width, height, steps, seed,
        response_format="base64",
        n=1,
    )
    result = client.images.generate(**request)
    image_data = base64.b64decode(result.data[0].b64_json)

    with open(output_path, "wb") as out_file:
        out_file.write(image_data)

    print(f" Saved to {output_path} ({len(image_data):,} bytes)")
    return output_path


def download_image(url: str, output_path: str) -> str:
    """Fetch *url* (60 s timeout), write the body to *output_path*, return the path.

    Raises whatever ``raise_for_status`` raises on an HTTP error response.
    """
    reply = requests.get(url, timeout=60)
    reply.raise_for_status()
    payload = reply.content
    with open(output_path, "wb") as out_file:
        out_file.write(payload)
    print(f" Downloaded {output_path} ({len(payload):,} bytes)")
    return output_path


def style_transfer(image_url: str, style: str, **kwargs) -> str:
    """Restyle an image (watercolor, oil painting, ...); extra kwargs go to ``edit_image``."""
    print(f" Style: {style}")
    style_prompt = f"Transform this into a {style}"
    return edit_image(prompt=style_prompt, image_url=image_url, **kwargs)


def modify_object(image_url: str, modification: str, **kwargs) -> str:
    """Use *modification* verbatim as the edit prompt; extra kwargs go to ``edit_image``."""
    print(f" Modification: {modification}")
    return edit_image(prompt=modification, image_url=image_url, **kwargs)


if __name__ == "__main__":
    # Source image for editing examples
    SOURCE_IMAGE = "https://cdn.pixabay.com/photo/2020/05/20/08/27/cat-5195431_1280.jpg"

    # --- Example 1: Style transfer ---
    print("=== Style Transfer ===")
    style_transfer(SOURCE_IMAGE, "watercolor painting")

    # --- Example 2: Object modification ---
    print("\n=== Object Modification ===")
    modify_object(SOURCE_IMAGE, "Make the cat wear a tiny top hat")

    # --- Example 3: Scene transformation ---
    print("\n=== Scene Transformation ===")
    edit_image(
        prompt="Place this cat in a snowy winter landscape",
        image_url=SOURCE_IMAGE,
        width=1344,
        height=768,
    )

    # --- Example 4: Landscape aspect ratio ---
    print("\n=== Landscape Edit ===")
    edit_image(
        prompt="Transform this into a pencil sketch",
        image_url=SOURCE_IMAGE,
        width=1536,
        height=1024,
    )

    # --- Example 5: Reproducible edit ---
    print("\n=== Reproducible Edit (seed=42) ===")
    edit_image(
        prompt="Make this a pop art poster",
        image_url=SOURCE_IMAGE,
        seed=42,
    )

    # --- Example 6: Edit and save locally ---
    print("\n=== Edit and Save Locally ===")
    edit_and_save(
        prompt="Change the background to a tropical beach at sunset",
        image_url=SOURCE_IMAGE,
        output_path="cat_beach.png",
    )

    # --- Example 7: Generate-then-edit pipeline ---
    # Generate an image with FLUX, then refine the background with Kontext.
    # This is the most common multi-step image workflow (e.g. product photos).
    print("\n=== Generate-then-Edit Pipeline ===")
    print("Step 1: Generate base image with FLUX")
    gen_response = client.images.generate(
        model="black-forest-labs/FLUX.1-schnell",
        prompt="A white ceramic vase with dried flowers on a wooden table",
        width=1024,
        height=1024,
        steps=4,
        n=1,
    )
    base_url = gen_response.data[0].url
    download_image(base_url, "vase_original.png")

    print("Step 2: Edit background with Kontext")
    edit_and_save(
        prompt="Change the background to a smooth gradient studio backdrop, "
        "keep the vase and flowers exactly the same",
        image_url=base_url,
        output_path="vase_studio.png",
    )
+ +Usage: + python lora_generation.py + +Requires: + uv pip install "together>=2.0.0" + export TOGETHER_API_KEY=your_key +""" + +from together import Together + +client = Together() + +LORA_MODEL = "black-forest-labs/FLUX.2-dev" + + +def generate_with_lora( + prompt: str, + loras: list[dict], + width: int = 1024, + height: int = 768, + steps: int = 28, + seed: int | None = None, +) -> str: + """Generate an image with LoRA adapters applied.""" + kwargs: dict = dict( + model=LORA_MODEL, + prompt=prompt, + width=width, + height=height, + steps=steps, + image_loras=loras, + ) + if seed is not None: + kwargs["seed"] = seed + + response = client.images.generate(**kwargs) + url = response.data[0].url + print(f" Image: {url}") + return url + + +if __name__ == "__main__": + # --- Example 1: Single LoRA (realism) --- + print("=== Realism LoRA ===") + generate_with_lora( + prompt="a professional photograph of a woman in a modern cafe", + loras=[ + { + "path": "https://huggingface.co/XLabs-AI/flux-RealismLora", + "scale": 0.8, + }, + ], + ) + + # --- Example 2: Two LoRAs combined --- + print("\n=== Two LoRAs Combined ===") + generate_with_lora( + prompt="a BLKLGHT image of a man walking outside on a rainy day", + loras=[ + { + "path": "https://replicate.com/fofr/flux-black-light", + "scale": 0.8, + }, + { + "path": "https://huggingface.co/XLabs-AI/flux-RealismLora", + "scale": 0.5, + }, + ], + ) + + # --- Example 3: Different scales --- + print("\n=== Subtle LoRA (low scale) ===") + generate_with_lora( + prompt="a portrait photo of a young man in golden hour lighting", + loras=[ + { + "path": "https://huggingface.co/XLabs-AI/flux-RealismLora", + "scale": 0.3, + }, + ], + ) + + print("\n=== Strong LoRA (high scale) ===") + generate_with_lora( + prompt="a portrait photo of a young man in golden hour lighting", + loras=[ + { + "path": "https://huggingface.co/XLabs-AI/flux-RealismLora", + "scale": 1.2, + }, + ], + ) diff --git 
a/plugins/togetherai/skills/together-sandboxes/SKILL.md b/plugins/togetherai/skills/together-sandboxes/SKILL.md new file mode 100644 index 00000000..8596484a --- /dev/null +++ b/plugins/togetherai/skills/together-sandboxes/SKILL.md @@ -0,0 +1,68 @@ +--- +name: together-sandboxes +description: "Remote Python execution in managed sandboxes on Together AI with stateful sessions, file uploads, data analysis, chart generation, and notebook-like runs via the Sandboxes API. Reach for it whenever the user wants managed remote Python execution instead of local execution, raw clusters, or full model hosting." +--- + +# Together Sandboxes + +## Overview + +Use Together Sandboxes when the user wants to execute Python remotely in a managed sandbox. + +Typical fits: + +- stateful Python sessions +- data analysis and chart generation +- agent-generated code execution +- file uploads into a remote runtime + +## When This Skill Wins + +- The user wants remote execution rather than local shell execution +- Session state needs to persist across multiple calls +- The result may include display outputs such as charts +- A lightweight managed runtime is enough; no custom infra is required + +## Hand Off To Another Skill + +- Use `together-gpu-clusters` for full infrastructure control or larger distributed jobs +- Use `together-dedicated-containers` for custom containerized runtime logic +- Use `together-chat-completions` if the user only wants generated code, not executed code + +## Quick Routing + +- Remote execution with session reuse + - Start with [scripts/execute_with_session.py](scripts/execute_with_session.py) or [scripts/execute_with_session.ts](scripts/execute_with_session.ts) +- Response schema and session listing + - Read [references/api-reference.md](references/api-reference.md) +- MCP-style access for agent workflows + - Read [references/api-reference.md](references/api-reference.md) + +## Workflow + +1. Decide whether the task needs code execution or only code generation. 
+2. Start a session with `client.code_interpreter.execute()`. +3. Reuse `session_id` when the workflow depends on prior state. +4. Inspect `stdout`, `stderr`, structured outputs, and display outputs separately. +5. List sessions only when the user needs operational visibility or cleanup. + +## High-Signal Rules + +- Python scripts require the Together v2 SDK (`together>=2.0.0`). If the user is on an older version, they must upgrade first: `uv pip install --upgrade "together>=2.0.0"`. +- Treat `session_id` as part of the workflow state. +- Inspect `response.errors` before assuming a run succeeded. +- `plt.show()` with the Agg backend does not reliably produce `display_data` outputs. To retrieve charts, save the figure to a `BytesIO` buffer with `fig.savefig()`, base64-encode it, and print the encoded string to stdout. Parse it from the `stdout` output on the client side. See the chart example in [scripts/execute_with_session.py](scripts/execute_with_session.py). +- Use this skill when the user benefits from remote stateful execution, not just because Python is involved. +- If the task outgrows the sandbox model, hand off to GPU clusters or dedicated containers. 
+ +## Resource Map + +- API reference: [references/api-reference.md](references/api-reference.md) +- Alternative access patterns: [references/api-reference.md](references/api-reference.md) +- Python workflow: [scripts/execute_with_session.py](scripts/execute_with_session.py) +- TypeScript workflow: [scripts/execute_with_session.ts](scripts/execute_with_session.ts) + +## Official Docs + +- [Together Sandboxes](https://docs.together.ai/docs/together-code-interpreter) +- [Sandboxes API](https://docs.together.ai/reference/tci-execute) diff --git a/plugins/togetherai/skills/together-sandboxes/references/api-reference.md b/plugins/togetherai/skills/together-sandboxes/references/api-reference.md new file mode 100644 index 00000000..87f9e58c --- /dev/null +++ b/plugins/togetherai/skills/together-sandboxes/references/api-reference.md @@ -0,0 +1,276 @@ +# Sandboxes API Reference +## Contents + +- [Endpoints](#endpoints) +- [Execute Code](#execute-code) +- [List Sessions](#list-sessions) +- [Pre-installed Packages](#pre-installed-packages) +- [Pricing](#pricing) +- [Alternative Access](#alternative-access) + + +## Endpoints + +| Endpoint | Operation | Description | +|--------|------|-------------| +| `POST /tci/execute` | Execute code | Run a code snippet in a sandboxed session | +| `GET /tci/sessions` | List sessions | List all active sessions | + +Base URL: `https://api.together.ai` +Authentication: `Authorization: Bearer $TOGETHER_API_KEY` + +## Execute Code + +### Request (ExecuteRequest) + +| Field | Type | Required | Description | +|-------|------|----------|-------------| +| `language` | string | Yes | Programming language (`python`) | +| `code` | string | Yes | Code snippet to execute | +| `session_id` | string | No | Reuse an existing session for persistent state | +| `files` | array | No | Files to upload before execution | + +### File Object + +| Field | Type | Required | Description | +|-------|------|----------|-------------| +| `name` | string | Yes | File name (e.g., 
`data.py`) | +| `encoding` | string | Yes | `string` or `base64` | +| `content` | string | Yes | File content | + +### Response (ExecuteResponse) + +| Field | Type | Description | +|-------|------|-------------| +| `data.session_id` | string | Session ID for follow-up calls | +| `data.status` | string | `success` | +| `data.outputs` | array | Execution results (see Output Types) | +| `errors` | array or null | Error details if execution failed | + +### Output Types + +| Type | Description | Data format | +|------|-------------|-------------| +| `stdout` | Standard output | string | +| `stderr` | Standard error | string | +| `error` | Exception/failure | string | +| `display_data` | Rich output | object (see below) | +| `execute_result` | Expression result | object (see below) | + +display_data / execute_result data object may contain: + +| Key | Description | +|-----|-------------| +| `application/json` | JSON data | +| `text/html` | HTML content | +| `text/markdown` | Markdown content | +| `text/latex` | LaTeX content | +| `image/png` | Base64-encoded PNG | +| `image/jpeg` | Base64-encoded JPEG | +| `image/gif` | Base64-encoded GIF | +| `image/svg+xml` | SVG content | +| `application/pdf` | Base64-encoded PDF | +| `application/vnd.vegalite.v5+json` | Vega-Lite visualization | +| `application/vnd.vega.v5+json` | Vega visualization | +| `application/geo+json` | GeoJSON data | + +### Examples + +```python +from together import Together +client = Together() + +response = client.code_interpreter.execute( + code='print("Hello world!")', + language="python", +) +print(response.data.outputs[0].data) +``` + +```typescript +import Together from "together-ai"; +const client = new Together(); + +const response = await client.codeInterpreter.execute({ + code: 'print("Hello world!")', + language: "python", +}); +console.log(response.data?.outputs?.[0]?.data); +``` + +```shell +curl -X POST "https://api.together.ai/tci/execute" \ + -H "Authorization: Bearer $TOGETHER_API_KEY" \ 
+ -H "Content-Type: application/json" \ + -d '{"code": "print(\"Hello world!\")", "language": "python"}' +``` + +### Session Reuse + +```python +# First call creates a session +response1 = client.code_interpreter.execute(code="x = 42", language="python") +session_id = response1.data.session_id + +# Second call reuses state +response2 = client.code_interpreter.execute( + code='print(f"x = {x}")', + language="python", + session_id=session_id, +) +``` + +```typescript +const response1 = await client.codeInterpreter.execute({ + code: "x = 42", + language: "python", +}); +const sessionId = response1.data.session_id; + +const response2 = await client.codeInterpreter.execute({ + code: 'print(f"x = {x}")', + language: "python", + session_id: sessionId, +}); +``` + +```shell +# Use session_id from first response in subsequent calls +curl -X POST "https://api.together.ai/tci/execute" \ + -H "Authorization: Bearer $TOGETHER_API_KEY" \ + -H "Content-Type: application/json" \ + -d '{"language": "python", "code": "x = 42"}' + +curl -X POST "https://api.together.ai/tci/execute" \ + -H "Authorization: Bearer $TOGETHER_API_KEY" \ + -H "Content-Type: application/json" \ + -d '{"language": "python", "code": "print(f\"x = {x}\")", "session_id": "ses_..."}' +``` + +### File Upload + +```python +response = client.code_interpreter.execute( + code="!python myscript.py", + language="python", + files=[{ + "name": "myscript.py", + "encoding": "string", + "content": "import sys\nprint(f'Hello from {sys.argv[0]}!')", + }], +) +``` + +```typescript +const response = await client.codeInterpreter.execute({ + code: "!python myscript.py", + language: "python", + files: [{ + name: "myscript.py", + encoding: "string", + content: "import sys\nprint(f'Hello from {sys.argv[0]}!')", + }], +}); +``` + +```shell +curl -X POST "https://api.together.ai/tci/execute" \ + -H "Authorization: Bearer $TOGETHER_API_KEY" \ + -H "Content-Type: application/json" \ + -d '{ + "language": "python", + "files": [{"name": 
"myscript.py", "encoding": "string", "content": "print(\"hello\")"}], + "code": "!python myscript.py" + }' +``` + +### Retrieving Charts + +`plt.show()` with the Agg backend does not reliably produce `display_data` outputs containing +`image/png`. To get chart images back to the client, save explicitly and base64-encode via stdout: + +```python +# --- Remote code (runs in the sandbox) --- +import matplotlib +matplotlib.use('Agg') +import matplotlib.pyplot as plt +import base64, io + +fig, ax = plt.subplots() +ax.bar(["Jan", "Feb", "Mar"], [100, 150, 130]) + +buf = io.BytesIO() +fig.savefig(buf, format="png", dpi=150) +buf.seek(0) +print("chart_b64:" + base64.b64encode(buf.read()).decode()) +plt.close(fig) +``` + +```python +# --- Client side --- +import base64 + +for output in response.data.outputs: + if output.type == "stdout" and "chart_b64:" in output.data: + b64 = output.data.split("chart_b64:", 1)[1].strip() + with open("chart.png", "wb") as f: + f.write(base64.b64decode(b64)) +``` + +If the API does return a `display_data` output with an `image/png` key, prefer that over stdout +parsing. Check both paths for maximum reliability. 
+ +## List Sessions + +### Response (SessionListResponse) + +| Field | Type | Description | +|-------|------|-------------| +| `data.sessions` | array | List of active session objects | + +### Session Object + +| Field | Type | Description | +|-------|------|-------------| +| `id` | string | Session identifier (e.g., `ses_abcDEF123`) | +| `execute_count` | integer | Number of executions in this session | +| `started_at` | datetime | Session start timestamp | +| `last_execute_at` | datetime | Most recent execution timestamp | +| `expires_at` | datetime | Session expiration timestamp | + +### Examples + +```python +response = client.code_interpreter.sessions.list() +for session in response.data.sessions: + print(f"{session.id}: {session.execute_count} executions, expires {session.expires_at}") +``` + +```typescript +const response = await client.codeInterpreter.sessions.list(); +for (const session of response.data?.sessions ?? []) { + console.log(`${session.id}: ${session.execute_count} executions`); +} +``` + +```shell +curl -X GET "https://api.together.ai/tci/sessions" \ + -H "Authorization: Bearer $TOGETHER_API_KEY" \ + -H "Content-Type: application/json" +``` + +## Pre-installed Packages + +aiohttp, beautifulsoup4, bokeh, gensim, imageio, joblib, librosa, matplotlib, nltk, numpy, opencv-python, openpyxl, pandas, plotly, pytest, python-docx, pytz, requests, scikit-image, scikit-learn, scipy, seaborn, soundfile, spacy, sympy, textblob, tornado, urllib3, xarray, xlrd + +Install additional packages at runtime with `!pip install <package>`. + +## Pricing + +$0.03 per session. Sessions last 60 minutes and support multiple executions. + +## Alternative Access + +Together AI also exposes MCP-compatible tooling for agent workflows that prefer MCP over direct API +calls. Use the direct TCI API when you need explicit SDK control over sessions, files, and response +objects; use MCP when the surrounding agent framework already speaks MCP. 
diff --git a/plugins/togetherai/skills/together-sandboxes/scripts/execute_with_session.py b/plugins/togetherai/skills/together-sandboxes/scripts/execute_with_session.py new file mode 100644 index 00000000..e56d7509 --- /dev/null +++ b/plugins/togetherai/skills/together-sandboxes/scripts/execute_with_session.py @@ -0,0 +1,130 @@ +#!/usr/bin/env python3 +""" +Together AI Code Interpreter - Execute Code with Session Reuse (v2 SDK) + +Run Python code in a sandboxed environment, reuse sessions to persist state, +and handle file outputs. + +Usage: + python execute_with_session.py + +Requires: + uv pip install "together>=2.0.0" + export TOGETHER_API_KEY=your_key +""" + +from together import Together + +client = Together() + + +def execute_code(code: str, session_id: str | None = None) -> dict: + """Execute Python code, optionally in an existing session.""" + response = client.code_interpreter.execute( + code=code, + language="python", + **({"session_id": session_id} if session_id else {}), + ) + + if response.errors: + print(f"Errors: {response.errors}") + return {"session_id": None, "outputs": [], "errors": response.errors} + + outputs = [] + for output in response.data.outputs: + if output.type in ("stdout", "stderr"): + print(f" [{output.type}] {output.data}") + outputs.append({"type": output.type, "data": output.data}) + elif output.type == "error": + print(f" [error] {output.data}") + outputs.append({"type": "error", "data": output.data}) + elif output.type in ("display_data", "execute_result"): + print(f" [{output.type}] {list(output.data.keys()) if isinstance(output.data, dict) else output.data}") + outputs.append({"type": output.type, "data": output.data}) + + return { + "session_id": response.data.session_id, + "outputs": outputs, + "errors": None, + } + + +def list_sessions(): + """List active code interpreter sessions.""" + response = client.code_interpreter.sessions.list() + for s in response.data.sessions: + print(f" Session {s.id}: {s.execute_count} 
executions, expires {s.expires_at}") + return response.data.sessions + + +if __name__ == "__main__": + # --- Example 1: Single execution --- + print("=== Single execution ===") + result = execute_code("print('Hello from Together Code Interpreter!')") + session_id = result["session_id"] + print(f"Session ID: {session_id}\n") + + # --- Example 2: Reuse session (state persists) --- + print("=== Session reuse - define variable ===") + execute_code("x = 42\nprint(f'Set x = {x}')", session_id=session_id) + + print("\n=== Session reuse - access variable ===") + execute_code("print(f'x is still {x}')", session_id=session_id) + + # --- Example 3: Data analysis with packages --- + print("\n=== Data analysis ===") + execute_code( + """ +import numpy as np + +data = np.random.randn(1000) +print(f"Mean: {data.mean():.4f}") +print(f"Std: {data.std():.4f}") +print(f"Min: {data.min():.4f}") +print(f"Max: {data.max():.4f}") +""", + session_id=session_id, + ) + + # --- Example 4: Generate a chart and retrieve as base64 PNG --- + # plt.show() with the Agg backend does not reliably produce display_data + # outputs. Instead, save the figure to a buffer and encode it explicitly. 
+ print("\n=== Chart generation ===") + chart_result = execute_code( + """ +import matplotlib +matplotlib.use('Agg') +import matplotlib.pyplot as plt +import numpy as np +import base64, io + +x = np.linspace(0, 10, 100) +fig, ax = plt.subplots(figsize=(8, 4)) +ax.plot(x, np.sin(x), label='sin(x)') +ax.plot(x, np.cos(x), label='cos(x)') +ax.legend() +ax.set_title('Trig Functions') + +buf = io.BytesIO() +fig.savefig(buf, format='png', dpi=150) +buf.seek(0) +png_b64 = base64.b64encode(buf.read()).decode() +plt.close(fig) +print(f"chart_png_base64:{png_b64}") +""", + session_id=session_id, + ) + + # Extract and save the chart locally + import base64 as _b64 + + for out in chart_result["outputs"]: + if out["type"] == "stdout" and "chart_png_base64:" in out["data"]: + b64_str = out["data"].split("chart_png_base64:", 1)[1].strip() + with open("trig_chart.png", "wb") as f: + f.write(_b64.b64decode(b64_str)) + print(f" Chart saved to trig_chart.png ({len(b64_str)} bytes b64)") + + # --- List active sessions --- + print("\n=== Active sessions ===") + list_sessions() diff --git a/plugins/togetherai/skills/together-sandboxes/scripts/execute_with_session.ts b/plugins/togetherai/skills/together-sandboxes/scripts/execute_with_session.ts new file mode 100644 index 00000000..d777c311 --- /dev/null +++ b/plugins/togetherai/skills/together-sandboxes/scripts/execute_with_session.ts @@ -0,0 +1,131 @@ +#!/usr/bin/env -S npx tsx +/** + * Together AI Code Interpreter - Execute Code with Session Reuse (TypeScript SDK) + * + * Run Python code in a sandboxed environment, reuse sessions to persist state, + * upload files, and handle display outputs. 
+ * + * Usage: + * npx tsx execute_with_session.ts + * + * Requires: + * npm install together-ai + * export TOGETHER_API_KEY=your_key + */ + +import Together from "together-ai"; + +const client = new Together(); + +async function executeCode( + code: string, + sessionId?: string, + files?: { name: string; encoding: string; content: string }[] +): Promise<{ sessionId: string | null; outputs: any[] }> { + const params: any = { code, language: "python" }; + if (sessionId) params.session_id = sessionId; + if (files) params.files = files; + + const response = await client.codeInterpreter.execute(params); + + if (response.errors) { + console.error(`Errors: ${JSON.stringify(response.errors)}`); + return { sessionId: null, outputs: [] }; + } + + const outputs: any[] = []; + for (const output of response.data.outputs) { + if (output.type === "stdout" || output.type === "stderr") { + console.log(` [${output.type}] ${output.data}`); + } else if (output.type === "error") { + console.log(` [error] ${output.data}`); + } else if (output.type === "display_data" || output.type === "execute_result") { + const keys = + typeof output.data === "object" ? Object.keys(output.data) : []; + console.log(` [${output.type}] ${JSON.stringify(keys)}`); + } + outputs.push({ type: output.type, data: output.data }); + } + + return { sessionId: response.data.session_id, outputs }; +} + +async function listSessions() { + const response = await client.codeInterpreter.sessions.list(); + for (const session of response.data?.sessions ?? 
[]) { + console.log( + ` Session ${session.id}: ${session.execute_count} executions, expires ${session.expires_at}` + ); + } +} + +async function main() { + // --- Example 1: Single execution --- + console.log("=== Single execution ==="); + const result = await executeCode( + 'print("Hello from Together Code Interpreter!")' + ); + const sessionId = result.sessionId!; + console.log(`Session ID: ${sessionId}\n`); + + // --- Example 2: Reuse session (state persists) --- + console.log("=== Session reuse -- define variable ==="); + await executeCode("x = 42\nprint(f'Set x = {x}')", sessionId); + + console.log("\n=== Session reuse -- access variable ==="); + await executeCode("print(f'x is still {x}')", sessionId); + + // --- Example 3: Data analysis --- + console.log("\n=== Data analysis ==="); + await executeCode( + ` +import numpy as np + +data = np.random.randn(1000) +print(f"Mean: {data.mean():.4f}") +print(f"Std: {data.std():.4f}") +print(f"Min: {data.min():.4f}") +print(f"Max: {data.max():.4f}") +`, + sessionId + ); + + // --- Example 4: Chart generation --- + console.log("\n=== Chart generation ==="); + await executeCode( + ` +import matplotlib +matplotlib.use('Agg') +import matplotlib.pyplot as plt +import numpy as np + +x = np.linspace(0, 10, 100) +plt.figure(figsize=(8, 4)) +plt.plot(x, np.sin(x), label='sin(x)') +plt.plot(x, np.cos(x), label='cos(x)') +plt.legend() +plt.title('Trig Functions') +plt.show() +`, + sessionId + ); + + // --- Example 5: File upload --- + console.log("\n=== File upload ==="); + await executeCode("!python myscript.py", undefined, [ + { + name: "myscript.py", + encoding: "string", + content: "import sys\nprint(f'Hello from {sys.argv[0]}!')", + }, + ]); + + // --- List active sessions --- + console.log("\n=== Active sessions ==="); + await listSessions(); +} + +main().catch((error) => { + console.error(error); + process.exit(1); +}); diff --git a/plugins/togetherai/skills/together-video/SKILL.md 
b/plugins/togetherai/skills/together-video/SKILL.md new file mode 100644 index 00000000..ab633323 --- /dev/null +++ b/plugins/togetherai/skills/together-video/SKILL.md @@ -0,0 +1,70 @@ +--- +name: together-video +description: "Text-to-video and image-to-video generation via Together AI, including keyframe control, model and dimension selection, asynchronous job polling, and video downloads. Reach for it whenever the user wants motion generation on Together AI rather than still-image generation or text-only inference." +--- + +# Together Video + +## Overview + +Use Together AI video APIs for: + +- text-to-video generation +- image-to-video generation +- first-frame and last-frame keyframe control +- asynchronous job polling +- local download of completed outputs + +## When This Skill Wins + +- Generate short videos from prompts +- Animate an existing image +- Choose among Veo, Sora, Kling, Seedance, PixVerse, Vidu, or other supported models +- Add polling and download logic to a product or script + +## Hand Off To Another Skill + +- Use `together-images` for still-image generation or editing +- Use `together-dedicated-containers` only when a custom video-serving runtime is required + +## Quick Routing + +- Text-to-video generation + - Start with [scripts/generate_video.py](scripts/generate_video.py) or [scripts/generate_video.ts](scripts/generate_video.ts) + - Read [references/api-reference.md](references/api-reference.md) +- Image-to-video with keyframes + - Start with [scripts/image_to_video.py](scripts/image_to_video.py) + - Read [references/api-reference.md](references/api-reference.md) +- Parameter tuning, polling, or troubleshooting + - Read [references/api-reference.md](references/api-reference.md) +- Model, dimension, and prompt-limit selection + - Read [references/models.md](references/models.md) + +## Workflow + +1. Confirm whether the user needs text-to-video or image-to-video. +2. 
Choose the model based on duration, dimension, keyframe support, and audio support. +3. Submit the async job and poll until a terminal state. +4. Download the result promptly before signed URLs expire. + +## High-Signal Rules + +- Python scripts require the Together v2 SDK (`together>=2.0.0`). If the user is on an older version, they must upgrade first: `uv pip install --upgrade "together>=2.0.0"`. +- Together video generation is asynchronous; do not treat it like a synchronous image call. +- Keyframe support is model-specific. Validate support before promising first-plus-last-frame control. +- Keep polling and download logic as part of the workflow, not as an afterthought. +- Use explicit dimensions and generation parameters rather than relying on unstable defaults. + +## Resource Map + +- API reference: [references/api-reference.md](references/api-reference.md) +- Polling, parameter tuning, and troubleshooting: [references/api-reference.md](references/api-reference.md) +- Model guide: [references/models.md](references/models.md) +- Python text-to-video workflow: [scripts/generate_video.py](scripts/generate_video.py) +- TypeScript text-to-video workflow: [scripts/generate_video.ts](scripts/generate_video.ts) +- Python image-to-video workflow: [scripts/image_to_video.py](scripts/image_to_video.py) + +## Official Docs + +- [Videos Overview](https://docs.together.ai/docs/videos-overview) +- [Create Video API](https://docs.together.ai/reference/create-videos) diff --git a/plugins/togetherai/skills/together-video/references/api-reference.md b/plugins/togetherai/skills/together-video/references/api-reference.md new file mode 100644 index 00000000..197ea074 --- /dev/null +++ b/plugins/togetherai/skills/together-video/references/api-reference.md @@ -0,0 +1,312 @@ +# Video Generation API Reference +## Contents + +- [Endpoints](#endpoints) +- [Create Video](#create-video) +- [Get Video Status](#get-video-status) +- [Job Statuses](#job-statuses) +- [Polling 
Pattern](#polling-pattern) +- [Guidance Scale](#guidance-scale) +- [Steps](#steps) +- [Troubleshooting](#troubleshooting) + + +## Endpoints + +| Endpoint | Operation | Description | +|--------|------|-------------| +| `POST /v2/videos` | Create video | Submit a video generation job | +| `GET /v2/videos/{id}` | Get video status | Poll for job completion | + +Base URL: `https://api.together.xyz` + +## Create Video + +### Request + +```python +from together import Together +client = Together() + +job = client.videos.create( + prompt="A serene sunset over the ocean with gentle waves", + model="minimax/video-01-director", + width=1366, + height=768, +) +print(job.id) +``` + +```typescript +import Together from "together-ai"; +const client = new Together(); + +const job = await client.videos.create({ + prompt: "A serene sunset over the ocean with gentle waves", + model: "minimax/video-01-director", + width: 1366, + height: 768, +}); +console.log(job.id); +``` + +```shell +curl -X POST "https://api.together.xyz/v2/videos" \ + -H "Authorization: Bearer $TOGETHER_API_KEY" \ + -H "Content-Type: application/json" \ + -d '{ + "model": "minimax/video-01-director", + "prompt": "A serene sunset over the ocean with gentle waves", + "width": 1366, + "height": 768 + }' +``` + +### Parameters + +| Parameter | Type | Required | Default | Description | +|-----------|------|----------|---------|-------------| +| `model` | string | Yes | - | Model identifier | +| `prompt` | string | Yes* | - | Text description (1-32,000 chars) | +| `width` | integer | No | 1366 | Video width in pixels | +| `height` | integer | No | 768 | Video height in pixels | +| `seconds` | string | No | varies | Clip duration override. Supported values depend on the model. 
| +| `fps` | integer | No | 24 | Frames per second (15-60) | +| `steps` | integer | No | varies | Diffusion steps (10-50) | +| `guidance_scale` | float | No | varies | Prompt adherence (recommended 6.0-10.0; avoid values above 12) | +| `seed` | integer | No | random | Random seed for reproducibility | +| `negative_prompt` | string | No | - | Elements to exclude | +| `frame_images` | array | No | - | Keyframe images: `[{input_image, frame}]` | +| `reference_images` | array | No | - | Style reference image URLs | +| `output_format` | string | No | `"MP4"` | `"MP4"` or `"WEBM"` | +| `output_quality` | integer | No | 20 | Compression quality (lower = higher quality) | + +*Prompt not required for Kling 2.1 Standard/Pro and Kling 1.6 Pro. + +### frame_images Schema + +Each element in the `frame_images` array: + +| Field | Type | Description | +|-------|------|-------------| +| `input_image` | string | Image URL or base64-encoded image data | +| `frame` | number or string | Frame index: `0`, `"first"`, or `"last"` | + +Frame number calculation: `frame = seconds x fps` (for specific frame positions). 
+ +### Advanced Example + +```python +job = client.videos.create( + prompt="A futuristic city at night with neon lights reflecting on wet streets", + model="minimax/hailuo-02", + width=1366, + height=768, + seconds="6", + fps=30, + steps=30, + guidance_scale=8.0, + output_format="MP4", + output_quality=20, + seed=42, + negative_prompt="blurry, low quality, distorted", +) +``` + +```typescript +const job = await client.videos.create({ + prompt: "A futuristic city at night with neon lights reflecting on wet streets", + model: "minimax/hailuo-02", + width: 1366, + height: 768, + seconds: "6", + fps: 30, + steps: 30, + guidance_scale: 8.0, + output_format: "MP4", + output_quality: 20, + seed: 42, + negative_prompt: "blurry, low quality, distorted", +}); +``` + +### Keyframe Example + +```python +job = client.videos.create( + prompt="Smooth camera zoom out revealing a vast landscape", + model="minimax/hailuo-02", + width=1366, + height=768, + frame_images=[{ + "input_image": "https://cdn.pixabay.com/photo/2020/05/20/08/27/cat-5195431_1280.jpg", + "frame": "first", + }], +) +``` + +```typescript +const job = await client.videos.create({ + prompt: "Smooth camera zoom out revealing a vast landscape", + model: "minimax/hailuo-02", + width: 1366, + height: 768, + frame_images: [{ + input_image: "https://cdn.pixabay.com/photo/2020/05/20/08/27/cat-5195431_1280.jpg", + frame: "first", + }], +}); +``` + +### Reference Images Example + +```python +job = client.videos.create( + prompt="A cat dancing energetically", + model="vidu/vidu-2.0", + width=1280, + height=720, + reference_images=[ + "https://cdn.pixabay.com/photo/2020/05/20/08/27/cat-5195431_1280.jpg", + ], +) +``` + +### Create Response + +```json +{ + "id": "019a0068-794a-7213-90f6-cc4eb62e3da7", + "object": "video", + "model": "minimax/video-01-director", + "status": "in_progress", + "created_at": 1729407438 +} +``` + +## Get Video Status + +### Request + +```python +status = 
client.videos.retrieve("019a0068-794a-7213-90f6-cc4eb62e3da7") +print(f"Status: {status.status}") +if status.status == "completed": + print(f"Video URL: {status.outputs.video_url}") + print(f"Cost: ${status.outputs.cost}") +``` + +```typescript +const status = await client.videos.retrieve("019a0068-794a-7213-90f6-cc4eb62e3da7"); +console.log(`Status: ${status.status}`); +if (status.status === "completed") { + console.log(`Video URL: ${status.outputs.video_url}`); + console.log(`Cost: $${status.outputs.cost}`); +} +``` + +```shell +curl -X GET "https://api.together.xyz/v2/videos/$JOB_ID" \ + -H "Authorization: Bearer $TOGETHER_API_KEY" +``` + +### Path Parameters + +| Parameter | Type | Required | Description | +|-----------|------|----------|-------------| +| `id` | string | Yes | Job ID from create response | + +### Response Schema + +| Field | Type | Description | +|-------|------|-------------| +| `id` | string | Unique job identifier | +| `object` | string | Always `"video"` | +| `model` | string | Model used | +| `status` | string | `queued`, `in_progress`, `completed`, `failed`, or `cancelled` (see Job Statuses) | +| `created_at` | number | Unix timestamp of creation | +| `completed_at` | number | Unix timestamp of completion | +| `size` | string | Video resolution | +| `seconds` | string | Clip duration | +| `outputs` | object | `{cost, video_url}` when completed | +| `error` | object | `{code, message}` when failed | + +### Completed Response + +```json +{ + "id": "019a0068-794a-7213-90f6-cc4eb62e3da7", + "object": "video", + "model": "minimax/video-01-director", + "status": "completed", + "created_at": 1729407438, + "completed_at": 1729407612, + "size": "1366x768", + "seconds": "5", + "outputs": { + "cost": 0.28, + "video_url": "https://api.together.ai/shrt/DwlaBdSakNRFlBxN" + } +} +``` + +## Job Statuses + +| Status | Description | +|--------|-------------| +| `queued` | Waiting in queue | +| `in_progress` | Generating | +| `completed` | Done -- `outputs.video_url` available | +| `failed` | Check 
`error` for details | +| `cancelled` | Job cancelled | + +## Polling Pattern + +```python +import time +from together import Together +client = Together() + +job = client.videos.create( + prompt="A mountain landscape at sunset", + model="minimax/video-01-director", +) + +while True: + status = client.videos.retrieve(job.id) + if status.status == "completed": + print(f"Video: {status.outputs.video_url}") + break + elif status.status == "failed": + print(f"Error: {status.error}") + break + time.sleep(5) +``` + +## Guidance Scale + +| Range | Effect | +|-------|--------| +| 6.0-7.0 | More creative, less literal | +| 7.0-9.0 | Balanced (recommended) | +| 9.0-10.0 | Strict prompt adherence | +| >12.0 | Avoid -- causes artifacts | + +## Steps + +| Steps | Effect | +|-------|--------| +| 10 | Quick testing, lower quality | +| 20 | Standard quality | +| 30-40 | Production-grade | +| >50 | Diminishing returns | + +## Troubleshooting + +| Issue | Solution | +|-------|----------| +| Prompt mismatch | Increase `guidance_scale` to 8-10 and use more specific language | +| Visual artifacts | Reduce `guidance_scale` below 12 and increase `steps` to 30-40 | +| Slow generation | Reduce `steps`, shorten `seconds`, or lower `fps` | +| URL expired | Download videos immediately after completion | +| Unnatural motion | Adjust `fps` and use `negative_prompt` to exclude unwanted artifacts | diff --git a/plugins/togetherai/skills/together-video/references/models.md b/plugins/togetherai/skills/together-video/references/models.md new file mode 100644 index 00000000..25dbaca4 --- /dev/null +++ b/plugins/togetherai/skills/together-video/references/models.md @@ -0,0 +1,62 @@ +# Video Generation Models Reference + +## Complete Model Table + +| Organization | Model | API String | Duration | Dimensions | FPS | Keyframes | +|-------------|-------|-----------|----------|-----------|-----|-----------| +| Google | Veo 3.0 | `google/veo-3.0` | 8s | 1280x720, 720x1280, 1920x1080, 1080x1920 | 24 | First | 
+| Google | Veo 3.0 + Audio | `google/veo-3.0-audio` | 8s | 1280x720, 720x1280, 1920x1080, 1080x1920 | 24 | First | +| Google | Veo 3.0 Fast | `google/veo-3.0-fast` | 8s | 1280x720, 720x1280, 1920x1080, 1080x1920 | 24 | First | +| Google | Veo 3.0 Fast + Audio | `google/veo-3.0-fast-audio` | 8s | 1280x720, 720x1280, 1920x1080, 1080x1920 | 24 | First | +| Google | Veo 2.0 | `google/veo-2.0` | 5s | 1280x720, 720x1280 | 24 | First, Last | +| OpenAI | Sora 2 | `openai/sora-2` | 8s | 1280x720, 720x1280 | - | First | +| OpenAI | Sora 2 Pro | `openai/sora-2-pro` | 8s | 1280x720, 720x1280 | - | First | +| MiniMax | Hailuo 02 | `minimax/hailuo-02` | 10s | 1366x768, 1920x1080 | 25 | First | +| MiniMax | Video 01 Director | `minimax/video-01-director` | 5s | 1366x768 | 25 | First | +| Kuaishou | Kling 2.1 Master | `kwaivgI/kling-2.1-master` | 5s | 1920x1080, 1080x1080, 1080x1920 | 24 | First | +| Kuaishou | Kling 2.1 Standard | `kwaivgI/kling-2.1-standard` | 5s | 1920x1080, 1080x1080, 1080x1920 | 24 | First | +| Kuaishou | Kling 2.1 Pro | `kwaivgI/kling-2.1-pro` | 5s | 1920x1080, 1080x1080, 1080x1920 | 24 | First, Last | +| Kuaishou | Kling 2.0 Master | `kwaivgI/kling-2.0-master` | 5s | 1280x720, 720x720, 720x1280 | 24 | First | +| Kuaishou | Kling 1.6 Standard | `kwaivgI/kling-1.6-standard` | 5s | 1920x1080, 1080x1080, 1080x1920 | 30, 24 | First | +| Kuaishou | Kling 1.6 Pro | `kwaivgI/kling-1.6-pro` | 5s | 1920x1080, 1080x1080, 1080x1920 | 24 | First | +| ByteDance | Seedance 1.0 Pro | `ByteDance/Seedance-1.0-pro` | 5s | Multiple (see below) | 24 | First, Last | +| ByteDance | Seedance 1.0 Lite | `ByteDance/Seedance-1.0-lite` | 5s | Multiple (see below) | 24 | First, Last | +| PixVerse | PixVerse v5 | `pixverse/pixverse-v5` | 5s | Multiple (see below) | 16, 24 | First, Last | +| Vidu | Vidu 2.0 | `vidu/vidu-2.0` | 8s | Multiple (see below) | 24 | First, Last | +| Vidu | Vidu Q1 | `vidu/vidu-q1` | 5s | 1920x1080, 1080x1080, 1080x1920 | 24 | First, Last | +| Wan-AI | Wan 2.2 
T2V | `Wan-AI/Wan2.2-T2V-A14B` | - | - | - | Text-to-Video | +| Wan-AI | Wan 2.2 I2V | `Wan-AI/Wan2.2-I2V-A14B` | - | - | - | Image-to-Video | + +## Seedance Dimensions + +864x480, 736x544, 640x640, 960x416, 416x960, 1248x704, 1120x832, 960x960, 1504x640, 640x1504 + +## PixVerse v5 Dimensions + +640x360, 480x360, 360x360, 270x360, 360x640, 960x540, 720x540, 540x540, 405x540, 540x960, +1280x720, 960x720, 720x720, 540x720, 720x1280, 1920x1080, 1440x1080, 1080x1080, 810x1080, +1080x1920 + +## Vidu 2.0 Dimensions + +1920x1080, 1080x1080, 1080x1920, 1280x720, 720x720, 720x1280, 640x360, 360x360, 360x640 + +## Feature Support + +| Feature | Models | +|---------|--------| +| Audio generation | Veo 3.0 + Audio, Veo 3.0 Fast + Audio | +| Reference images | Vidu 2.0 | +| First + Last keyframe | Veo 2.0, Kling 2.1 Pro, Seedance, PixVerse, Vidu | +| 10 second duration | Hailuo 02 | +| 1080p output | Veo 3.0, Seedance Pro, PixVerse, Kling 2.1, Vidu Q1, Sora 2 Pro | +| No prompt required | Kling 2.1 Standard/Pro, Kling 1.6 Pro | + +## Prompt Limits + +| Model | Prompt Length | +|-------|-------------| +| Most models | 2-3,000 characters | +| PixVerse v5 | 2-2,048 characters | +| Kling 2.1 Master | 2-2,500 characters | +| Sora 2 / 2 Pro | 1-4,000 characters | diff --git a/plugins/togetherai/skills/together-video/scripts/generate_video.py b/plugins/togetherai/skills/together-video/scripts/generate_video.py new file mode 100644 index 00000000..4f1a4578 --- /dev/null +++ b/plugins/togetherai/skills/together-video/scripts/generate_video.py @@ -0,0 +1,130 @@ +#!/usr/bin/env python3 +""" +Together AI Video Generation -- Text-to-Video with Polling (v2 SDK) + +Submit a video job, poll for completion, and download the result. +Demonstrates text-to-video, advanced parameters, and reference images. 
+ +Usage: + python generate_video.py + +Requires: + uv pip install "together>=2.0.0" requests + export TOGETHER_API_KEY=your_key +""" + +import time + +import requests +from together import Together + +client = Together() + + +def wait_for_video(job_id: str, poll_interval: int = 5, timeout: int = 600) -> str: + """Poll a video job until completion. Returns the video URL.""" + elapsed = 0 + while elapsed < timeout: + status = client.videos.retrieve(job_id) + print(f" Status: {status.status} ({elapsed}s)") + + if status.status == "completed": + video_url = status.outputs.video_url + cost = status.outputs.cost + print(f" Video ready! Cost: ${cost}") + print(f" URL: {video_url}") + return video_url + elif status.status == "failed": + error = getattr(status, "error", None) + raise RuntimeError(f"Video generation failed: {error}") + + time.sleep(poll_interval) + elapsed += poll_interval + + raise TimeoutError(f"Video job {job_id} did not complete within {timeout}s") + + +def text_to_video( + prompt: str, + model: str = "minimax/video-01-director", + width: int = 1366, + height: int = 768, + **kwargs, +) -> str: + """Generate a video from a text prompt.""" + job = client.videos.create( + prompt=prompt, + model=model, + width=width, + height=height, + **kwargs, + ) + print(f"Submitted job: {job.id}") + return wait_for_video(job.id) + + +def text_to_video_advanced( + prompt: str, + model: str = "minimax/hailuo-02", +) -> str: + """Generate a video with advanced parameters.""" + job = client.videos.create( + prompt=prompt, + model=model, + width=1366, + height=768, + seconds="6", + fps=30, + steps=30, + guidance_scale=8.0, + output_format="MP4", + output_quality=20, + seed=42, + negative_prompt="blurry, low quality, distorted", + ) + print(f"Submitted job: {job.id}") + return wait_for_video(job.id) + + +def video_with_reference( + prompt: str, + reference_images: list[str], + model: str = "vidu/vidu-2.0", +) -> str: + """Generate a video guided by reference images (Vidu 
2.0).""" + job = client.videos.create( + prompt=prompt, + model=model, + width=1280, + height=720, + reference_images=reference_images, + ) + print(f"Submitted job: {job.id}") + return wait_for_video(job.id) + + +if __name__ == "__main__": + # --- Example 1: Basic text-to-video --- + print("=== Basic Text-to-Video ===") + url = text_to_video( + prompt="A serene sunset over the ocean with gentle waves lapping at the shore", + ) + + # Download the video + response = requests.get(url) + with open("output.mp4", "wb") as f: + f.write(response.content) + print(f" Saved to output.mp4 ({len(response.content)} bytes)") + + # --- Example 2: Advanced parameters --- + # print("\n=== Advanced Parameters ===") + # text_to_video_advanced( + # prompt="A futuristic city at night with neon lights reflecting on wet streets", + # ) + + # --- Example 3: Reference images (Vidu 2.0) --- + # print("\n=== Reference Images ===") + # video_with_reference( + # prompt="A cat dancing energetically", + # reference_images=["https://cdn.pixabay.com/photo/2020/05/20/08/27/cat-5195431_1280.jpg"], + # ) diff --git a/plugins/togetherai/skills/together-video/scripts/generate_video.ts b/plugins/togetherai/skills/together-video/scripts/generate_video.ts new file mode 100644 index 00000000..a5f8d3b0 --- /dev/null +++ b/plugins/togetherai/skills/together-video/scripts/generate_video.ts @@ -0,0 +1,116 @@ +#!/usr/bin/env -S npx tsx +/** + * Together AI Video Generation -- Text-to-Video with Polling + * + * Submit a video job, poll for completion, and log the result. + * Demonstrates text-to-video, advanced parameters, and keyframes. 
+ * + * Usage: + * npx tsx generate_video.ts + * + * Requires: + * npm install together-ai + * export TOGETHER_API_KEY=your_key + */ + +import { writeFileSync } from "fs"; +import Together from "together-ai"; + +const client = new Together({ + apiKey: process.env.TOGETHER_API_KEY, +}); + +async function waitForVideo( + jobId: string, + pollMs: number = 5000, + timeoutMs: number = 600_000, +): Promise<string> { + const start = Date.now(); + while (Date.now() - start < timeoutMs) { + const status = await client.videos.retrieve(jobId); + const elapsed = Math.round((Date.now() - start) / 1000); + console.log(` Status: ${status.status} (${elapsed}s)`); + + if (status.status === "completed") { + const url = status.outputs.video_url; + console.log(` Video ready! Cost: $${status.outputs.cost}`); + console.log(` URL: ${url}`); + return url; + } + if (status.status === "failed") { + throw new Error(`Video generation failed: ${JSON.stringify(status.error)}`); + } + + await new Promise((r) => setTimeout(r, pollMs)); + } + throw new Error(`Video job ${jobId} did not complete within ${timeoutMs / 1000}s`); +} + +async function downloadVideo(url: string, outputPath: string): Promise<void> { + const resp = await fetch(url); + if (!resp.ok) throw new Error(`Download failed: ${resp.status}`); + const buf = Buffer.from(await resp.arrayBuffer()); + writeFileSync(outputPath, buf); + console.log(` Saved to ${outputPath} (${buf.length} bytes)`); +} + +async function basicTextToVideo(): Promise<void> { + console.log("=== Basic Text-to-Video ==="); + const job = await client.videos.create({ + prompt: "A serene sunset over the ocean with gentle waves", + model: "minimax/video-01-director", + width: 1366, + height: 768, + }); + console.log(`Job ID: ${job.id}`); + const url = await waitForVideo(job.id); + await downloadVideo(url, "output.mp4"); +} + +async function advancedParameters(): Promise<void> { + console.log("\n=== Advanced Parameters ==="); + const job = await client.videos.create({ + prompt: "A futuristic 
city at night with neon lights reflecting on wet streets", + model: "minimax/hailuo-02", + width: 1366, + height: 768, + seconds: "6", + fps: 30, + steps: 30, + guidance_scale: 8.0, + output_format: "MP4", + output_quality: 20, + seed: 42, + negative_prompt: "blurry, low quality, distorted", + }); + console.log(`Job ID: ${job.id}`); + await waitForVideo(job.id); +} + +async function imageToVideo(): Promise<void> { + console.log("\n=== Image-to-Video (Keyframe) ==="); + const job = await client.videos.create({ + prompt: "Smooth camera zoom out revealing a vast landscape", + model: "minimax/video-01-director", + width: 1366, + height: 768, + frame_images: [ + { + input_image: + "https://cdn.pixabay.com/photo/2020/05/20/08/27/cat-5195431_1280.jpg", + frame: "first", + }, + ], + }); + console.log(`Job ID: ${job.id}`); + await waitForVideo(job.id); +} + +async function main(): Promise<void> { + await basicTextToVideo(); + // Uncomment to run additional examples: + // await advancedParameters(); + // await imageToVideo(); +} + +main(); diff --git a/plugins/togetherai/skills/together-video/scripts/image_to_video.py b/plugins/togetherai/skills/together-video/scripts/image_to_video.py new file mode 100644 index 00000000..79ad37b9 --- /dev/null +++ b/plugins/togetherai/skills/together-video/scripts/image_to_video.py @@ -0,0 +1,160 @@ +#!/usr/bin/env python3 +""" +Together AI Video -- Image-to-Video with Keyframe Control (v2 SDK) + +Animate images using keyframe control, poll until completion, and download the MP4. +Supports first frame, last frame, and first+last frame control depending on model. 
+ +Usage: + python image_to_video.py <image> [--prompt "..."] [--output promo.mp4] + +Requires: + uv pip install "together>=2.0.0" requests + export TOGETHER_API_KEY=your_key +""" + +import argparse +import base64 +import time +import requests as http_requests +from together import Together + +client = Together() + + +def wait_for_video(job_id: str, poll_interval: int = 5, timeout: int = 600) -> str: + """Poll a video job until completion. Returns the video URL.""" + elapsed = 0 + while elapsed < timeout: + status = client.videos.retrieve(job_id) + print(f" Status: {status.status} ({elapsed}s)") + + if status.status == "completed": + print(f" Video URL: {status.outputs.video_url}") + return status.outputs.video_url + elif status.status == "failed": + error = getattr(status, "error", None) + raise RuntimeError(f"Video generation failed: {error}") + + time.sleep(poll_interval) + elapsed += poll_interval + + raise TimeoutError(f"Video job {job_id} did not complete within {timeout}s") + + +def download_video(video_url: str, output_file: str) -> None: + """Download the completed video to a local file.""" + response = http_requests.get(video_url, timeout=120) + response.raise_for_status() + with open(output_file, "wb") as f: + f.write(response.content) + print(f"Saved to {output_file} ({len(response.content)} bytes)") + + +def image_to_video_url( + prompt: str, + image_url: str, + model: str = "minimax/video-01-director", + frame: str = "first", + width: int = 1366, + height: int = 768, + output_file: str = "promo.mp4", +) -> str: + """Animate an image using a URL (no base64 encoding needed).""" + job = client.videos.create( + prompt=prompt, + model=model, + width=width, + height=height, + frame_images=[{"input_image": image_url, "frame": frame}], + ) + print(f"Submitted job: {job.id}") + video_url = wait_for_video(job.id) + download_video(video_url, output_file) + return video_url + + +def image_to_video_base64( + prompt: str, + image_path: str, + model: str = 
"minimax/video-01-director", + frame: str = "first", + width: int = 1366, + height: int = 768, + output_file: str = "promo.mp4", +) -> str: + """Animate an image from a local file (base64-encoded).""" + with open(image_path, "rb") as f: + img_b64 = base64.b64encode(f.read()).decode("utf-8") + + job = client.videos.create( + prompt=prompt, + model=model, + width=width, + height=height, + frame_images=[{"input_image": img_b64, "frame": frame}], + ) + print(f"Submitted job: {job.id}") + video_url = wait_for_video(job.id) + download_video(video_url, output_file) + return video_url + + +def first_and_last_keyframes( + prompt: str, + first_image_url: str, + last_image_url: str, + model: str = "ByteDance/Seedance-1.0-pro", + width: int = 1248, + height: int = 704, +) -> str: + """Animate between two keyframes (first and last frame).""" + job = client.videos.create( + prompt=prompt, + model=model, + width=width, + height=height, + frame_images=[ + {"input_image": first_image_url, "frame": "first"}, + {"input_image": last_image_url, "frame": "last"}, + ], + ) + print(f"Submitted job: {job.id}") + return wait_for_video(job.id) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Create a promo clip from a single image.") + parser.add_argument("image", help="Image URL or local file path") + parser.add_argument( + "--prompt", + default="Turn this single image into a 5-second promo clip with a slow cinematic camera move", + help="Video prompt", + ) + parser.add_argument("--output", default="promo.mp4", help="Where to save the downloaded MP4") + parser.add_argument("--model", default="minimax/video-01-director", help="Video model") + parser.add_argument("--frame", default="first", help="Keyframe position: first or last") + parser.add_argument("--width", type=int, default=1366, help="Output width") + parser.add_argument("--height", type=int, default=768, help="Output height") + args = parser.parse_args() + + if args.image.startswith(("http://", 
"https://")): + image_to_video_url( + prompt=args.prompt, + image_url=args.image, + model=args.model, + frame=args.frame, + width=args.width, + height=args.height, + output_file=args.output, + ) + else: + image_to_video_base64( + prompt=args.prompt, + image_path=args.image, + model=args.model, + frame=args.frame, + width=args.width, + height=args.height, + output_file=args.output, + ) From f54bf315e3e8c053c8d4da6eb986d6b6e1979cf3 Mon Sep 17 00:00:00 2001 From: Saoud Rizwan <7799382+saoudrizwan@users.noreply.github.com> Date: Wed, 17 Jun 2026 19:27:09 -0700 Subject: [PATCH 2/2] fix: remove extra togetherai rule primitive --- plugins/togetherai/README.md | 2 +- plugins/togetherai/index.ts | 16 +--------------- plugins/togetherai/package.json | 3 +-- 3 files changed, 3 insertions(+), 18 deletions(-) diff --git a/plugins/togetherai/README.md b/plugins/togetherai/README.md index bb2d3167..60864ac1 100644 --- a/plugins/togetherai/README.md +++ b/plugins/togetherai/README.md @@ -8,7 +8,7 @@ This plugin bundles Together AI skills for chat completions, batch inference, em Each skill includes workflow guidance plus local reference files and example Python or TypeScript scripts. The plugin does not register an MCP server and does not run Together AI calls during install. -The plugin also adds a Together AI safety rule so Cline asks before running scripts, installing SDKs, spending credits, uploading data, creating or deleting endpoints, launching clusters, or using remote execution. +The bundled skills ask Cline to get approval before running scripts, installing SDKs, spending credits, uploading data, creating or deleting endpoints, launching clusters, or using remote execution. 
## Install diff --git a/plugins/togetherai/index.ts b/plugins/togetherai/index.ts index 92e348bf..7bfe522d 100644 --- a/plugins/togetherai/index.ts +++ b/plugins/togetherai/index.ts @@ -1,23 +1,9 @@ import type { AgentPlugin } from "@cline/sdk" -const togetherAiRule = [ - "Together AI skills can create paid API calls, generated media, remote code runs, fine-tuning jobs, dedicated endpoints, containers, GPU clusters, and storage resources.", - "Do not run bundled scripts, install SDKs, submit jobs, create/delete infrastructure, upload training data or models, or spend API credits without explicit user approval.", - "Treat TOGETHER_API_KEY, external provider tokens, datasets, prompts, generated media URLs, model outputs, cluster credentials, and evaluation results as sensitive unless the user says otherwise.", - "Prefer read-only planning and local validation first. For destructive or cost-bearing workflows, state the target resource, expected cost/risk, and rollback or cleanup plan before proceeding.", -].join("\n") - const plugin: AgentPlugin = { name: "togetherai", manifest: { - capabilities: ["skills", "rules"], - }, - setup(api) { - api.registerRule({ - id: "togetherai-safety", - source: "togetherai", - content: togetherAiRule, - }) + capabilities: ["skills"], }, } diff --git a/plugins/togetherai/package.json b/plugins/togetherai/package.json index 1870a584..91dfe515 100644 --- a/plugins/togetherai/package.json +++ b/plugins/togetherai/package.json @@ -11,8 +11,7 @@ "./index.ts" ], "capabilities": [ - "skills", - "rules" + "skills" ] } ]