Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
51 changes: 48 additions & 3 deletions mstar/api_server/data_worker.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
from mstar.communication.tensors import NameToTensorList, create_tensor_communication_manager
from mstar.model.base import Model
from mstar.utils.ipc_format import (
AbortRequest,
ConductorMessage,
ConductorMessageType,
NewRequestConductor,
Expand Down Expand Up @@ -51,6 +52,8 @@ def __init__(
self.request_input_queue = queue.Queue()
self.result_tensor_input_queue = queue.Queue()
self.cleanup_request_queue = queue.Queue()
self.abort_request_queue = queue.Queue()
self.discard_tensor_queue = queue.Queue()
self.output_queue = queue.Queue()
self.stop_event = threading.Event()

Expand All @@ -64,6 +67,8 @@ def __init__(
result_tensor_queue=self.result_tensor_input_queue,
out_queue=self.output_queue,
cleanup_request_queue=self.cleanup_request_queue,
abort_request_queue=self.abort_request_queue,
discard_tensor_queue=self.discard_tensor_queue,
stop_event=self.stop_event,
hostname=hostname,
socket_path_prefix=socket_path_prefix,
Expand All @@ -79,10 +84,16 @@ def new_request(self, input: PreprocessInput):
self.per_request_reading_tensors[input.request_id] = 0
self.request_input_queue.put(input)

def abort_request(self, request_id: str):
self.abort_request_queue.put(request_id)

def new_result_tensors(self, input: ResultTensors):
name = input.graph_edge.name
if input.request_id not in self.output_loop_idxs:
logger.debug("Late result_tensors for cleaned-up request %s, ignoring", input.request_id)
# Request was removed while this output was still in flight; ack the
# tensors so the producing worker can reclaim them rather than leak.
logger.debug("Late result_tensors for cleaned-up request %s, acking and dropping", input.request_id)
self.discard_result_tensors(input)
return

self.output_loop_idxs[input.request_id][name] = input.loop_indices.max(
Expand All @@ -96,6 +107,14 @@ def new_result_tensors(self, input: ResultTensors):
)
self.result_tensor_input_queue.put(input)

def discard_result_tensors(self, input: ResultTensors):
"""Ack and drop result tensors for an already-removed request.

Routed to the worker thread (which owns the communicator) so the
producing worker gets its TENSOR_RECEIVED ack and frees the buffers.
"""
self.discard_tensor_queue.put(input)

def has_pending_tensors(self, request_id: str):
return self.per_request_reading_tensors.get(request_id, 0) > 0

Expand Down Expand Up @@ -123,8 +142,8 @@ def get_result_chunks(self)-> list[ResultChunk]:

def cleanup_request(self, request_id: str):
self.cleanup_request_queue.put(request_id)
del self.output_loop_idxs[request_id]
del self.per_request_reading_tensors[request_id]
self.output_loop_idxs.pop(request_id, None)
self.per_request_reading_tensors.pop(request_id, None)

def shutdown(self):
self.stop_event.set()
Expand All @@ -139,6 +158,8 @@ def __init__(
result_tensor_queue: queue.Queue, # for output streaming
out_queue: queue.Queue,
cleanup_request_queue: queue.Queue,
abort_request_queue: queue.Queue,
discard_tensor_queue: queue.Queue,
stop_event: threading.Event,
hostname: str = "localhost",
socket_path_prefix: str = "/tmp/mstar",
Expand All @@ -150,6 +171,8 @@ def __init__(
self.in_queue = in_queue
self.result_tensor_queue = result_tensor_queue
self.cleanup_request_queue = cleanup_request_queue
self.abort_request_queue = abort_request_queue
self.discard_tensor_queue = discard_tensor_queue
self.out_queue = out_queue

self.stop_event = stop_event
Expand Down Expand Up @@ -275,6 +298,16 @@ def _read_result_tensor(
self.tensor_uuid_to_metadata_per_request[result.request_id][
tensor_info.uuid] = result.metadata

def _discard_result_tensor(
self, result: ResultTensors
):
# The request is gone, so don't start a read — just ack the tensors back
# to the producing worker so it can free the source buffers.
self.tensor_manager.ack_unread_tensors(
request_id=result.request_id,
graph_edges=[result.graph_edge],
)

def _process_read_tensors(self):
did_work = False
for request_id, graph_edges in self.tensor_manager.get_ready_tensors().items():
Expand Down Expand Up @@ -348,6 +381,18 @@ def run(self):
if not self.result_tensor_queue.empty():
did_work = True
self._read_result_tensor(self.result_tensor_queue.get())
if not self.abort_request_queue.empty():
did_work = True
self.communicator.send(
"conductor",
ConductorMessage(
message_type=ConductorMessageType.ABORT_REQUEST,
body=AbortRequest(request_id=self.abort_request_queue.get()),
),
)
if not self.discard_tensor_queue.empty():
did_work = True
self._discard_result_tensor(self.discard_tensor_queue.get())
if not self.cleanup_request_queue.empty():
did_work = True
req_id = self.cleanup_request_queue.get()
Expand Down
128 changes: 77 additions & 51 deletions mstar/api_server/entrypoint.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
from typing import Optional

import uvicorn
from fastapi import FastAPI, File, Form, HTTPException, UploadFile
from fastapi import FastAPI, File, Form, HTTPException, Request, UploadFile
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse, StreamingResponse
from starlette.concurrency import run_in_threadpool
Expand Down Expand Up @@ -325,10 +325,14 @@ def _process_messages(self) -> None:
message.body.final_outputs
elif rid in self.recently_completed:
logger.debug("Late message for completed %s: %s", rid, message.message_type)
if message.message_type == "result_tensors":
self.preprocess_worker.discard_result_tensors(message.body)
else:
logger.warning(
"Message for unknown request %s: %s", rid, message.message_type
)
if message.message_type == "result_tensors":
self.preprocess_worker.discard_result_tensors(message.body)
for result_chunk in self.preprocess_worker.get_result_chunks():
logger.debug(
"Got result chunk of %s modality for request %s",
Expand Down Expand Up @@ -360,42 +364,46 @@ async def iter_result_chunks(self, request_id: str):
pre-serialized line).
"""
start = time.time()
while True:
if time.time() - start > self.timeout_seconds:
with self.request_lock:
self.pending_requests.pop(request_id, None)
raise HTTPException(status_code=500, detail="Request timed out")

new_chunks: list[ResultChunk] = []
done = False
with self.request_lock:
req = self.pending_requests.get(request_id)
if req:
avail = len(req["chunks"])
consumed = req["consumed_chunks"]
new_chunks = req["chunks"][consumed:avail]
req["consumed_chunks"] = avail
done = req["event"].is_set()
else:
done = True

for chunk in new_chunks:
yield chunk

if done:
logger.info("Async stream results received finish for %s", request_id)
# flush remaining
remaining: list[ResultChunk] = []
finished = False
try:
while True:
if time.time() - start > self.timeout_seconds:

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Probably for another PR (I can raise an issue), but I think it could be worthwhile to be able to raise/lower timeout_seconds per request? e.g., to fail fast if we know the request should be short, or to bump it up for a longer request. Thoughts?

raise HTTPException(status_code=500, detail="Request timed out")

new_chunks: list[ResultChunk] = []
done = False
with self.request_lock:
req = self.pending_requests.get(request_id)
if req:
remaining = req["chunks"][req["consumed_chunks"]:]
self.pending_requests.pop(request_id, None)
for chunk in remaining:
avail = len(req["chunks"])
consumed = req["consumed_chunks"]
new_chunks = req["chunks"][consumed:avail]
req["consumed_chunks"] = avail
done = req["event"].is_set()
else:
done = True

for chunk in new_chunks:
yield chunk
break

await asyncio.sleep(0.001)
if done:
logger.info("Async stream results received finish for %s", request_id)
# flush remaining
remaining: list[ResultChunk] = []
with self.request_lock:
req = self.pending_requests.get(request_id)
if req:
remaining = req["chunks"][req["consumed_chunks"]:]
self.pending_requests.pop(request_id, None)
for chunk in remaining:
yield chunk
finished = True
break

await asyncio.sleep(0.001)
finally:
if not finished:
self.abort_request(request_id)

async def async_stream_results(self, request_id: str):
"""Yield NDJSON lines as result chunks arrive (``/generate`` format)."""
Expand All @@ -411,28 +419,47 @@ def _chunk_to_ndjson(chunk: ResultChunk) -> str:
}) + "\n"

# ----------------------------------------------------------
# Blocking helper (non-streaming)
# Non-streaming helper
# ----------------------------------------------------------

def collect_results(self, request_id: str) -> list[ResultChunk]:
"""Block until the request completes, then return all chunks."""
with self.request_lock:
req = self.pending_requests.get(request_id)
if not req:
raise HTTPException(
status_code=404, detail=f"Request {request_id} not found"
)
event = req["event"]

if not event.wait(timeout=self.timeout_seconds):
async def collect_results(
self, request_id: str, raw_request: Request | None = None
) -> list[ResultChunk]:
"""Wait for the request to finish (or the client to disconnect), then
return its chunks. Disconnecting or timing out releases engine state."""
start = time.time()
while True:
with self.request_lock:
self.pending_requests.pop(request_id, None)
raise HTTPException(status_code=500, detail="Request timed out")
req = self.pending_requests.get(request_id)
done = req["event"].is_set() if req else True
if done:
break
if time.time() - start > self.timeout_seconds:
self.abort_request(request_id)
raise HTTPException(status_code=500, detail="Request timed out")
if raw_request is not None and await raw_request.is_disconnected():
self.abort_request(request_id)
return []
await asyncio.sleep(0.005)

with self.request_lock:
chunks = self.pending_requests[request_id]["chunks"][:]
req = self.pending_requests.pop(request_id, None)
return list(req["chunks"]) if req else []

def abort_request(self, request_id: str) -> None:
"""Stop GPU work for a request the client abandoned and drop its state."""
with self.request_lock:
active = (
request_id in self.pending_requests
or request_id in self.recently_completed
)
self.pending_requests.pop(request_id, None)
return chunks
self.recently_completed.pop(request_id, None)
if not active:
return
logger.info("Client cancelled request %s; releasing resources", request_id)
self.preprocess_worker.abort_request(request_id)

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: is it cleaner to have preprocess_worker.abort_request also do the cleanup path, instead of the caller having to remember to call both?

self.preprocess_worker.cleanup_request(request_id)

# ----------------------------------------------------------
# Cleanup
Expand Down Expand Up @@ -472,6 +499,7 @@ def cleanup(self) -> None:

@app.post("/generate")
async def generate(
request: Request,
text: Optional[str] = Form(None),
files: Optional[list[UploadFile]] = File(None),
input_modalities: Optional[str] = Form(None),
Expand Down Expand Up @@ -547,9 +575,7 @@ async def generate(
headers={"Cache-Control": "no-cache"},
)

chunks = await run_in_threadpool(
api_server.collect_results, request_id
)
chunks = await api_server.collect_results(request_id, request)
outputs: dict[str, list[dict]] = {}
for chunk in chunks:
outputs.setdefault(chunk.modality, []).append({
Expand Down
13 changes: 7 additions & 6 deletions mstar/api_server/openai/router.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,12 +75,12 @@ async def list_models():


@router.post("/v1/chat/completions")
async def chat_completions(request: ChatCompletionRequest):
async def chat_completions(request: ChatCompletionRequest, raw_request: Request):
api, model_name, adapter, err = _resolve("supports_chat")
if err is not None:
return err
try:
result = await serving_chat.create_chat_completion(api, model_name, adapter, request)
result = await serving_chat.create_chat_completion(api, model_name, adapter, request, raw_request)
except Exception as e: # noqa: BLE001 — surface as an OpenAI error envelope
return _error(getattr(e, "status_code", 500), str(getattr(e, "detail", e)), "server_error")
if request.stream:
Expand All @@ -91,23 +91,23 @@ async def chat_completions(request: ChatCompletionRequest):


@router.post("/v1/audio/speech")
async def audio_speech(request: SpeechRequest):
async def audio_speech(request: SpeechRequest, raw_request: Request):
api, model_name, adapter, err = _resolve("supports_speech")
if err is not None:
return err
try:
return await serving_speech.create_speech(api, model_name, adapter, request)
return await serving_speech.create_speech(api, model_name, adapter, request, raw_request)
except Exception as e: # noqa: BLE001
return _error(getattr(e, "status_code", 500), str(getattr(e, "detail", e)), "server_error")


@router.post("/v1/images/generations")
async def images_generations(request: ImageGenerationRequest):
async def images_generations(request: ImageGenerationRequest, raw_request: Request):
api, model_name, adapter, err = _resolve("supports_images")
if err is not None:
return err
try:
result = await serving_images.create_images(api, model_name, adapter, request)
result = await serving_images.create_images(api, model_name, adapter, request, raw_request)
except Exception as e: # noqa: BLE001
return _error(getattr(e, "status_code", 500), str(getattr(e, "detail", e)), "server_error")
return JSONResponse(result)
Expand Down Expand Up @@ -144,6 +144,7 @@ async def images_edits(request: Request):
image_bytes=image_bytes,
image_filename=getattr(image, "filename", None),
model_kwargs=extra,
raw_request=request,
)
except Exception as e: # noqa: BLE001
return _error(getattr(e, "status_code", 500), str(getattr(e, "detail", e)), "server_error")
Expand Down
6 changes: 2 additions & 4 deletions mstar/api_server/openai/serving_chat.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,11 @@

import base64

from starlette.concurrency import run_in_threadpool

from mstar.api_server import media_io
from mstar.api_server.openai._util import SSE_DONE, now, rid, sse


async def create_chat_completion(api, model_name, adapter, req):
async def create_chat_completion(api, model_name, adapter, req, raw_request=None):
args = adapter.chat_to_request(req, api.upload_dir)
request_id = rid("chatcmpl")
sample_rate = api.model.get_output_sample_rate("audio") if api.model is not None else 24000
Expand All @@ -33,7 +31,7 @@ async def create_chat_completion(api, model_name, adapter, req):

if req.stream:
return _stream(api, model_name, request_id, sample_rate)
chunks = await run_in_threadpool(api.collect_results, request_id)
chunks = await api.collect_results(request_id, raw_request)
return _build_response(model_name, request_id, chunks, sample_rate)


Expand Down
Loading
Loading