diff --git a/text_to_segment/.gitignore b/text_to_segment/.gitignore
new file mode 100644
index 0000000..b534a18
--- /dev/null
+++ b/text_to_segment/.gitignore
@@ -0,0 +1,4 @@
+*.mp4
+*.png
+*.jpg
+*.zip
diff --git a/text_to_segment/blending.py b/text_to_segment/blending.py
new file mode 100644
index 0000000..cae1da0
--- /dev/null
+++ b/text_to_segment/blending.py
@@ -0,0 +1,51 @@
+import sieve
+import cv2
+import numpy as np
+
+from utils import resize_and_crop
+
+
+def blend_to_background(object_video, mask_video, background_img):
+    """
+    superimpose `object_video` onto `background_img` using `mask_video`
+
+    assumes that `mask_video` frames correspond 1-1 with `object_video` frames
+    (but framerate doesn't matter)
+    """
+    object_video = cv2.VideoCapture(object_video.path)
+    mask_video = cv2.VideoCapture(mask_video.path)
+    background = cv2.imread(background_img.path)
+
+    output_path = "blended_output.mp4"
+
+    frame_width = int(object_video.get(cv2.CAP_PROP_FRAME_WIDTH))
+    frame_height = int(object_video.get(cv2.CAP_PROP_FRAME_HEIGHT))
+    fps = object_video.get(cv2.CAP_PROP_FPS)
+
+    background = resize_and_crop(background, frame_width, frame_height)
+
+    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
+    output_video = cv2.VideoWriter(output_path, fourcc, fps, (frame_width, frame_height))
+
+    while True:
+        ret_obj, obj_frame = object_video.read()
+        ret_mask, mask_frame = mask_video.read()
+
+        if not ret_obj or not ret_mask:
+            break
+
+        if len(mask_frame.shape) == 3:
+            mask_frame = cv2.cvtColor(mask_frame, cv2.COLOR_BGR2GRAY)
+
+        mask = mask_frame.astype(np.float32) / 255.0
+        mask = np.expand_dims(mask, axis=2)
+
+        blended_frame = (obj_frame * mask + background * (1 - mask)).astype(np.uint8)
+
+        output_video.write(blended_frame)
+
+    object_video.release()
+    mask_video.release()
+    output_video.release()
+
+    return sieve.File(path=output_path)
diff --git a/text_to_segment/galaxy.jpg b/text_to_segment/galaxy.jpg
new file mode 100644
index 0000000..40958a8
Binary files /dev/null and b/text_to_segment/galaxy.jpg differ
diff --git a/text_to_segment/main.py b/text_to_segment/main.py
new file mode 100644
index 0000000..eeabfa9
--- /dev/null
+++ b/text_to_segment/main.py
@@ -0,0 +1,136 @@
+import sieve
+import cv2
+import shutil
+import os
+import zipfile
+import tempfile
+import numpy as np
+
+from blending import blend_to_background
+from utils import (
+    get_first_frame,
+    zip_to_mp4,
+    splice_audio
+)
+
+
+def get_object_bbox(video: sieve.File, object_name: str):
+    yolo = sieve.function.get('sieve/yolov8')
+
+    frame = get_first_frame(video)
+
+    response = yolo.run(
+        file=frame,
+        classes=object_name,
+        models='yolov8l-world',
+    )
+
+    box = response['boxes'][0]
+
+    return box
+
+
+@sieve.function(
+    name="text-to-segment",
+    python_packages=["opencv-python"],
+    system_packages=[
+        "ffmpeg",
+        "libgl1-mesa-glx",
+        "libglib2.0-0"
+    ]
+)
+def segment(video: sieve.File, subject: str):
+    sam = sieve.function.get("sieve/sam2")
+
+    box = get_object_bbox(video, subject)
+
+    sam_prompt = {
+        "frame_index": 0,
+        "object_id": 1,
+        "box": [box['x1'],box['y1'],box['x2'],box['y2']]
+    }
+
+    debug, response = sam.run(
+        file=video,
+        prompts=[sam_prompt],
+        model_type="tiny",
+        pixel_confidences=True,
+        debug_masks=True,
+        bbox_tracking=True
+    )
+
+    return debug, response
+
+
+
+
+@sieve.function(
+    name="background-replace",
+    python_packages=["opencv-python"],
+    system_packages=[
+        "ffmpeg",
+        "libgl1-mesa-glx",
+        "libglib2.0-0"
+    ]
+)
+def background_replace(
+    video: sieve.File,
+    background: sieve.File,
+    subject: str,
+):
+
+    _, response = segment(video, subject)
+
+    mask_video = zip_to_mp4(response["confidences"])
+
+    blended_vid = blend_to_background(video, mask_video, background)
+
+    out = splice_audio(blended_vid, video)
+
+    return out
+
+
+if __name__ == "__main__":
+    # video_path = "trolley.mp4"
+
+    # video = sieve.File(path=video_path)
+    # debug, response = segment(video, "trolley")
+
+    # shutil.move(debug.path, "output.mp4")
+
+    # breakpoint()
+
+########################################
+
+    # mp4 = zip_to_mp4("masks.zip")
+    # shutil.move(mp4.path, "masks.mp4")
+
+########################################
+
+    # video_path = "trolley.mp4"
+    # mask_path = "confidence.mp4"
+    # bg = "galaxy.jpg"
+
+    # output_path = blend_to_background(video_path, mask_path, bg)
+    # shutil.move(output_path, "blended.mp4")
+
+########################################
+
+    # video_path = "trolley.mp4"
+    # bg_path = "galaxy.jpg"
+    # subject = "trolley"
+
+    video_path = "musk_fixed.mp4"
+    bg_path = "galaxy.jpg"
+    subject = "man"
+
+    video = sieve.File(path=video_path)
+    background = sieve.File(path=bg_path)
+
+    replaced = background_replace(video, background, subject)
+
+    shutil.move(replaced.path, f"{video_path.split('.')[0]}final_output.mp4")
+
+
+
+
diff --git a/text_to_segment/trolley.mp4 b/text_to_segment/trolley.mp4
new file mode 100644
index 0000000..7de569e
Binary files /dev/null and b/text_to_segment/trolley.mp4 differ
diff --git a/text_to_segment/utils.py b/text_to_segment/utils.py
new file mode 100644
index 0000000..de61740
--- /dev/null
+++ b/text_to_segment/utils.py
@@ -0,0 +1,90 @@
+import sieve
+import cv2
+import tempfile
+import os
+import zipfile
+
+
+def get_first_frame(video: sieve.File):
+    """
+    get the first frame of a video as a sieve.File
+    """
+    video_path = video.path
+
+    cap = cv2.VideoCapture(video_path)
+    ret, frame = cap.read()
+
+    if ret:
+        cv2.imwrite('first_frame.png', frame)
+    else:
+        raise Exception("Failed to read the video")
+
+    frame = sieve.File(path='first_frame.png')
+    cap.release()
+
+    return frame
+
+
+
+def zip_to_mp4(frames_zip: sieve.File):
+    """
+    convert zip file of frames to an mp4
+    """
+    output_path = "output_video.mp4"
+    with tempfile.TemporaryDirectory() as temp_dir:
+        with zipfile.ZipFile(frames_zip.path, 'r') as zip_ref:
+            zip_ref.extractall(temp_dir)
+
+        images = [img for img in os.listdir(temp_dir) if img.endswith(".png")]
+        images = sorted(images, key=lambda x: int(x.split('_')[1]))
+
+        first_frame = cv2.imread(os.path.join(temp_dir, images[0]))
+        height, width, layers = first_frame.shape
+        frame_size = (width, height)
+
+        # Define the codec and create VideoWriter object
+        out = cv2.VideoWriter(output_path, cv2.VideoWriter_fourcc(*'mp4v'), 30, frame_size)
+
+        # Loop through the images and write them to the video
+        for image in images:
+            img_path = os.path.join(temp_dir, image)
+            frame = cv2.imread(img_path)
+            out.write(frame)
+
+    out.release()
+    return sieve.File(path=output_path)
+
+
+def resize_and_crop(image, target_width, target_height):
+    """
+    resize image to meet target_height, target_width without stretching
+    """
+
+    image_height, image_width = image.shape[:2]
+
+    target_aspect = target_width / target_height
+    image_aspect = image_width / image_height
+
+    if image_aspect > target_aspect:
+        new_height = target_height
+        new_width = int(image_aspect * new_height)
+    else:
+        new_width = target_width
+        new_height = int(new_width / image_aspect)
+
+    resized_image = cv2.resize(image, (new_width, new_height))
+
+    crop_x = (new_width - target_width) // 2
+    crop_y = (new_height - target_height) // 2
+
+    cropped_image = resized_image[crop_y:crop_y + target_height, crop_x:crop_x + target_width]
+
+    return cropped_image
+
+
+def splice_audio(video, audio):
+    spliced_path = "spliced.mp4"
+    cmd = f"ffmpeg -y -nostdin -loglevel error -i {video.path} -i {audio.path} -c:v copy -c:a aac {spliced_path}"
+    os.system(cmd)
+
+    return sieve.File(path=spliced_path)