From 148c660c3c943f908e35d3727555e8d89a53db7c Mon Sep 17 00:00:00 2001
From: Goncalves Braz <l.b.goncalvesbraz@uu.nl>
Date: Sun, 22 Feb 2026 23:43:04 +0100
Subject: [PATCH 1/4] Add SayCan language-conditioned manipulation

Introduce saycan experiment package for language-conditioned robotic manipulation. Adds README and modules to integrate CLIPort (TransporterNet + CLIP), ViLD object detection, an Ollama-backed LLM interface, PyBullet UR5e environment, dataset utilities, and helper functions. Includes asset downloader and configuration (config.py), a CLIPort wrapper with checkpoint loading/migration logic (cliport.py), a SHARPIE-friendly environment wrapper that combines LLM planning, affordance scoring and CLIPort execution (environment.py), dataset collection code (datasets.py), and utility helpers (helpers.py).
---
 saycan/README.md         | 105 +++++++
 saycan/cliport.py        | 563 +++++++++++++++++++++++++++++++++++
 saycan/config.py         | 143 +++++++++
 saycan/datasets.py       |  60 ++++
 saycan/environment.py    | 310 +++++++++++++++++++
 saycan/helpers.py        | 111 +++++++
 saycan/llm.py            | 170 +++++++++++
 saycan/pick_place_env.py | 487 ++++++++++++++++++++++++++++++
 saycan/policy.py         |  61 ++++
 saycan/requirements.txt  |  41 +++
 saycan/robot.py          | 161 ++++++++++
 saycan/vild.py           | 624 +++++++++++++++++++++++++++++++++++++++
 12 files changed, 2836 insertions(+)
 create mode 100644 saycan/README.md
 create mode 100644 saycan/cliport.py
 create mode 100644 saycan/config.py
 create mode 100644 saycan/datasets.py
 create mode 100644 saycan/environment.py
 create mode 100644 saycan/helpers.py
 create mode 100644 saycan/llm.py
 create mode 100644 saycan/pick_place_env.py
 create mode 100644 saycan/policy.py
 create mode 100644 saycan/requirements.txt
 create mode 100644 saycan/robot.py
 create mode 100644 saycan/vild.py
diff --git a/saycan/README.md b/saycan/README.md
new file mode 100644
index 0000000..3911187
--- /dev/null
+++ b/saycan/README.md
@@ -0,0 +1,105 @@
+# SayCan - Language-Conditioned Robotic Manipulation
+
+This experiment implements the SayCan approach for grounding language in robotic affordances,
+combining:
+- **ViLD**: Open-vocabulary object detection
+- **LLM (Ollama)**: Task planning and action scoring
+- **CLIPort**: Language-conditioned pick-and-place manipulation
+- **PyBullet**: Physics simulation with UR5e robot arm
+
+## Installation
+
+### Core Dependencies
+```bash
+pip install -r requirements.txt
+```
+
+### Ollama (for LLM)
+Install Ollama from [ollama.ai](https://ollama.ai) and pull a model:
+```bash
+ollama pull llama3.2:1b
+```
+
+### Asset Downloads
+Assets (robot URDFs, ViLD model, CLIPort checkpoint) are downloaded automatically on first run.
+
+## Setup on the Webserver
+
+<!-- Add this environment to the database -->
+```bash
+python manage.py shell -c "from experiment.models import Environment; Environment.objects.update_or_create(name='SayCan', defaults={
+    'description': 'Language-conditioned robotic manipulation with LLM planning',
+    'filepaths': {'environment': 'saycan/environment.py'}
+})"
+```
+
+<!-- Add this experiment to the database -->
+```bash
+python manage.py shell -c "from experiment.models import Experiment, Environment; Experiment.objects.update_or_create(link='saycan', defaults={
+    'name': 'SayCan',
+    'short_description': 'Robot manipulation with natural language instructions',
+    'long_description': 'Guide a robot arm to pick and place objects using natural language instructions. The system uses ViLD for object detection, an LLM for task planning, and CLIPort for language-conditioned manipulation.\r\n\n<br>\n<br>\nYou can give instructions like:\n<ul>\n<li>\"task: put all blocks in bowls\" - Set a high-level task</li>\n<li>\"pick the blue block and place it on the red bowl\" - Direct instruction</li>\n</ul>',
+    'enabled': True,
+    'environment': Environment.objects.get(name='SayCan'),
+    'number_of_episodes': 1,
+    'target_fps': 24.0,
+    'wait_for_inputs': True
+})"
+```
+
+<!-- Add this policy to the database -->
+```bash
+python manage.py shell -c "from experiment.models import Policy; Policy.objects.update_or_create(name='SayCan', defaults={
+    'description': 'SayCan policy with LLM planning and CLIPort execution',
+    'filepaths': {'policy': 'saycan/policy.py'},
+    'checkpoint_interval': 0
+})"
+```
+
+<!-- Add this agent to the database -->
+```bash
+python manage.py shell -c "from experiment.models import Agent, Policy; Agent.objects.update_or_create(role='agent_0', defaults={
+    'name': 'Robot',
+    'description': 'UR5e robot arm with Robotiq gripper',
+    'policy': Policy.objects.get(name='SayCan'),
+    'participant': True,
+    'keyboard_inputs': {},
+    'multiple_keyboard_inputs': False,
+    'inputs_type': 'actions',
+    'textual_inputs': True
+})"
+```
+
+<!-- Link agent to experiment -->
+```bash
+python manage.py shell -c "from experiment.models import Experiment, Agent; exp = Experiment.objects.get(link='saycan'); exp.agents.add(Agent.objects.get(role='agent_0'))"
+```
+
+## Usage
+
+### Action Types
+The environment accepts the following action types:
+
+| Action | Description |
+|--------|-------------|
+| `"task:<description>"` | Set a high-level task for LLM planning |
+| `"plan"` | Get next planned action from LLM |
+| `"<text instruction>"` | Direct pick-and-place instruction |
+| `"done"` | End the episode |
+
+### Example Tasks
+- `task: put all blocks in bowls`
+- `task: stack the blocks`
+- `task: sort blocks by color`
+- `pick the blue block and place it on the red bowl`
+
+## References
+
+- **SayCan**: [Ahn et al. (2022) - Do As I Can, Not As I Say](https://arxiv.org/abs/2204.01691)
+- **CLIPort**: [Shridhar et al. (2021) - What and Where Pathways for Robotic Manipulation](https://arxiv.org/abs/2109.12098)
+- **ViLD**: [Gu et al. (2021) - Open-Vocabulary Object Detection via Vision and Language Knowledge Distillation](https://arxiv.org/abs/2104.13921)
+
+## Repository
+
+- Original SayCan: https://github.com/google-research/google-research/tree/master/saycan
+- CLIPort: https://github.com/cliport/cliport
\ No newline at end of file
diff --git a/saycan/cliport.py b/saycan/cliport.py
new file mode 100644
index 0000000..c7bec74
--- /dev/null
+++ b/saycan/cliport.py
@@ -0,0 +1,563 @@
+"""
+CLIPort - CLIP + Transporter Networks for Language-Conditioned Manipulation.
+
+This module implements the CLIPort architecture for language-conditioned pick-and-place
+operations. It combines CLIP (Contrastive Language-Image Pre-training) with Transporter
+Networks to predict pick and place positions based on natural language instructions.
+
+Key Components:
+- ResNet-based encoder-decoder architecture
+- CLIP text and image encoders
+- Transporter Networks for pick and place heatmap prediction
+- Pretrained checkpoint loading
+
+CLIPort Repository:
+    https://github.com/cliport/cliport
+
+Reference:
+    Shridhar, M., Manuelli, L., & Fox, D. (2021). CLIPort: What and Where Pathways
+    for Robotic Manipulation. Conference on Robot Learning (CoRL).
+
+Used in SayCan:
+    https://github.com/google-research/google-research/tree/master/saycan
+"""
+
+import os
+import subprocess
+import numpy as np
+import torch
+import clip
+import matplotlib.pyplot as plt
+import jax
+import jax.numpy as jnp
+import optax
+import flax
+from flax import linen as nn
+from flax.training import checkpoints
+from moviepy import ImageSequenceClip
+from IPython.display import display
+
+# Get the saycan directory for checkpoint paths
+SAYCAN_DIR = os.path.dirname(os.path.abspath(__file__))
+
+class ResNetBlock(nn.Module):
+  """Pre-activation bottleneck ResNet block (ReLU before every conv). https://arxiv.org/pdf/1603.05027.pdf"""
+  features: int  # Output channels; the first two convs use features // 4.
+  stride: int = 1  # Stride of conv0 and of the shortcut projection conv3.
+
+  def setup(self):
+    self.conv0 = nn.Conv(self.features // 4, (1, 1), (self.stride, self.stride))  # 1x1 reduce, strided.
+    self.conv1 = nn.Conv(self.features // 4, (3, 3))  # 3x3 at the reduced width.
+    self.conv2 = nn.Conv(self.features, (1, 1))  # 1x1 expand back to `features`.
+    self.conv3 = nn.Conv(self.features, (1, 1), (self.stride, self.stride))  # Shortcut projection.
+
+  def __call__(self, x):
+    y = self.conv0(nn.relu(x))
+    y = self.conv1(nn.relu(y))
+    y = self.conv2(nn.relu(y))
+    if x.shape != y.shape:  # Project the shortcut only when channels or resolution changed.
+      x = self.conv3(nn.relu(x))
+    return x + y
+
+
+class UpSample(nn.Module):
+  """Simple 2D 2x bilinear upsample."""
+
+  def __call__(self, x):
+    B, H, W, C = x.shape
+    new_shape = (B, H * 2, W * 2, C)
+    return jax.image.resize(x, new_shape, 'bilinear')
+
+
+class ResNet(nn.Module):
+  """Hourglass 53-layer ResNet with 8-stride."""
+  out_dim: int  # Number of channels produced by the final conv1.
+
+  def setup(self):
+    self.dense0 = nn.Dense(8)  # Only referenced by the commented-out early-fusion path in __call__.
+    # Encoder: block2, block4 and block6 each use stride 2 (8x total downsampling).
+    self.conv0 = nn.Conv(64, (3, 3), (1, 1))
+    self.block0 = ResNetBlock(64)
+    self.block1 = ResNetBlock(64)
+    self.block2 = ResNetBlock(128, stride=2)
+    self.block3 = ResNetBlock(128)
+    self.block4 = ResNetBlock(256, stride=2)
+    self.block5 = ResNetBlock(256)
+    self.block6 = ResNetBlock(512, stride=2)
+    self.block7 = ResNetBlock(512)
+    # Decoder: three 2x upsamples undo the 8x downsampling.
+    self.block8 = ResNetBlock(256)
+    self.block9 = ResNetBlock(256)
+    self.upsample0 = UpSample()
+    self.block10 = ResNetBlock(128)
+    self.block11 = ResNetBlock(128)
+    self.upsample1 = UpSample()
+    self.block12 = ResNetBlock(64)
+    self.block13 = ResNetBlock(64)
+    self.upsample2 = UpSample()
+    self.block14 = ResNetBlock(16)
+    self.block15 = ResNetBlock(16)
+    self.conv1 = nn.Conv(self.out_dim, (3, 3), (1, 1))
+
+  def __call__(self, x, text):
+    # x: NHWC image batch; text: one feature row per image (512-d at the call sites in this file).
+    # # Project and concatenate CLIP features (early fusion).
+    # text = self.dense0(text)
+    # text = jnp.expand_dims(text, axis=(1, 2))
+    # text = jnp.broadcast_to(text, x.shape[:3] + (8,))
+    # x = jnp.concatenate((x, text), axis=-1)
+
+    x = self.conv0(x)
+    x = self.block0(x)
+    x = self.block1(x)
+    x = self.block2(x)
+    x = self.block3(x)
+    x = self.block4(x)
+    x = self.block5(x)
+    x = self.block6(x)
+    x = self.block7(x)
+
+    # Concatenate CLIP features (mid-fusion). broadcast_to(x.shape) needs the text width to equal x's 512 channels.
+    text = jnp.expand_dims(text, axis=(1, 2))
+    text = jnp.broadcast_to(text, x.shape)
+    x = jnp.concatenate((x, text), axis=-1)
+
+    x = self.block8(x)
+    x = self.block9(x)
+    x = self.upsample0(x)
+    x = self.block10(x)
+    x = self.block11(x)
+    x = self.upsample1(x)
+    x = self.block12(x)
+    x = self.block13(x)
+    x = self.upsample2(x)
+    x = self.block14(x)
+    x = self.block15(x)
+    x = self.conv1(x)
+    return x
+
+
+class TransporterNets(nn.Module):
+  """TransporterNet with 3 ResNets (translation only)."""
+
+  def setup(self):
+    # Picking affordances.
+    self.pick_net = ResNet(1)
+
+    # Pick-conditioned placing affordances.
+    self.q_net = ResNet(3)  # Query (crop around pick location).
+    self.k_net = ResNet(3)  # Key (place features).
+    self.crop_size = 64
+    self.crop_conv = nn.Conv(features=1, kernel_size=(self.crop_size, self.crop_size), use_bias=False, dtype=jnp.float32, padding='SAME')  # Kernel is supplied per sample in __call__.
+
+  def __call__(self, x, text, p=None, train=True):
+    B, H, W, C = x.shape  # x: NHWC batch. p: optional (B, 2) pick pixels as (v, u) rows. `train` is not used here.
+    pick_out = self.pick_net(x, text)  # (B, H, W, 1)
+
+    # Get key features.
+    k = self.k_net(x, text)
+
+    # Pad by half a crop on each side before cropping. Note: mode 'maximum' pads with the maximum value, not zeros.
+    h = self.crop_size // 2
+    x_crop = jnp.pad(x, [(0, 0), (h, h), (h, h), (0, 0)], 'maximum')
+
+    # Get query features and convolve them over key features.
+    place_out = jnp.zeros((0, H, W, 1), jnp.float32)  # Empty batch; one (1, H, W, 1) map is appended per sample.
+    for b in range(B):
+
+      # Get coordinates at center of crop: the pick_out argmax unless explicit pixels p were given.
+      if p is None:
+        pick_out_b = pick_out[b, ...]  # (H, W, 1)
+        pick_out_b = pick_out_b.flatten()  # (H * W,)
+        amax_i = jnp.argmax(pick_out_b)
+        v, u = jnp.unravel_index(amax_i, (H, W))
+      else:
+        v, u = p[b, :]
+
+      # Get query crop. In the padded image a slice starting at (v, u) is centered on pixel (v, u) of x.
+      x_crop_b = jax.lax.dynamic_slice(x_crop, (b, v, u, 0), (1, self.crop_size, self.crop_size, x_crop.shape[3]))
+      # x_crop_b = x_crop[b:b+1, v:(v + self.crop_size), u:(u + self.crop_size), ...]
+
+      # Convolve q (query) across k (key): q is used as the kernel of crop_conv.
+      q = self.q_net(x_crop_b, text[b:b+1, :])  # (1, crop_size, crop_size, 3)
+      q = jnp.transpose(q, (1, 2, 3, 0))  # (crop_size, crop_size, 3, 1)
+      place_out_b = self.crop_conv.apply({'params': {'kernel': q}}, k[b:b+1, ...])  # (1, H, W, 1)
+      scale = 1 / (self.crop_size * self.crop_size)  # For higher softmax temperatures.
+      place_out_b *= scale
+      place_out = jnp.concatenate((place_out, place_out_b), axis=0)
+
+    return pick_out, place_out
+
+
+def n_params(params):
+  return jnp.sum(jnp.int32([n_params(v) if isinstance(v, dict) or isinstance(v, flax.core.frozen_dict.FrozenDict) else np.prod(v.shape) for v in params.values()]))
+
+from flax.training import train_state
+
+class TrainState(train_state.TrainState):  # Local subclass of flax's TrainState; adds no fields or methods.
+  pass
+
+
+
+
+# NOTE: everything below runs at import time. load_pretrained selects checkpoint loading (True) or the training loop (False).
+load_pretrained = True  #@param {type:"boolean"}
+
+# Initialize model weights using dummy tensors (batch of 4; 224x224 images with 5 channels; 512-d text features).
+rng = jax.random.PRNGKey(0)
+rng, key = jax.random.split(rng)
+init_img = jnp.ones((4, 224, 224, 5), jnp.float32)
+init_text = jnp.ones((4, 512), jnp.float32)
+init_pix = jnp.zeros((4, 2), np.int32)  # Explicit pick pixels, passed as `p` so init does not take the argmax path.
+init_params = TransporterNets().init(key, init_img, init_text, init_pix)['params']
+print(f'Model parameters: {n_params(init_params):,}')
+
+# Define the Optax optimizer
+optimizer_tx = optax.adam(learning_rate=1e-4)
+
+# Create an initial TrainState object. This will have step=0.
+optim = TrainState.create(apply_fn=TransporterNets().apply,
+                          params=init_params,
+                          tx=optimizer_tx)
+
+if load_pretrained:
+  ckpt_path = os.path.join(SAYCAN_DIR, f'ckpt_{40000}')
+  if not os.path.exists(ckpt_path):
+    import subprocess
+    print("Downloading CLIPort checkpoint...")
+    subprocess.run(['gdown', '--id', '1Nq0q1KbqHOA5O7aRSu4u7-u27EMMXqgP', '-O', ckpt_path], check=False)
+
+  try:
+    # Attempt to restore directly. This will fail if 'step' is missing in the checkpoint.
+    optim = checkpoints.restore_checkpoint(ckpt_path, optim)
+    print('Loaded:', ckpt_path)
+  except ValueError as e:
+    if "Missing field step in state dict" in str(e):
+      print("Attempting to load old checkpoint format (missing 'step' field).")
+      # Load the raw checkpoint data as a dictionary
+      loaded_state_dict = checkpoints.restore_checkpoint(ckpt_path, target=None)
+
+      if isinstance(loaded_state_dict, dict):
+        # Extract parameters, common keys for parameters are 'params' or 'target'
+        params_from_ckpt = loaded_state_dict.get('params', loaded_state_dict.get('target', init_params))
+        
+        # Re-initialize the opt_state using the current optax optimizer with loaded parameters.
+        # This means the exact state of the old optimizer might be lost if it was not optax-compatible,
+        # but model parameters are preserved.
+        new_opt_state = optimizer_tx.init(params_from_ckpt)
+
+        # Create a new TrainState with the loaded parameters, re-initialized opt_state, and step=0
+        optim = TrainState(
+            step=0, # Default to step 0 if not present in old checkpoint
+            params=params_from_ckpt,
+            tx=optimizer_tx,
+            opt_state=new_opt_state,
+            apply_fn=TransporterNets().apply
+        )
+        print('Successfully migrated and loaded checkpoint (params restored, opt_state re-initialized, step set to 0).')
+      else:
+        print(f"Error: Checkpoint '{ckpt_path}' is not a dictionary. Cannot migrate. Using initial model state.")
+    else:
+      # Re-raise other ValueErrors
+      raise
+
+else:
+
+  # Training loop.
+  batch_size = 8
+  for train_iter in range(1, 40001):
+    batch_i = np.random.randint(dataset_size, size=batch_size)
+    text_feat = data_text_feats[batch_i, ...]
+    img = dataset['image'][batch_i, ...] / 255
+    img = np.concatenate((img, np.broadcast_to(coords[None, ...], (batch_size,) + coords.shape)), axis=3)
+
+    # Get onehot label maps.
+    pick_yx = np.zeros((batch_size, 2), dtype=np.int32)
+    pick_onehot = np.zeros((batch_size, 224, 224), dtype=np.float32)
+    place_onehot = np.zeros((batch_size, 224, 224), dtype=np.float32)
+    for i in range(len(batch_i)):
+      pick_y, pick_x  = dataset['pick_yx'][batch_i[i], :]
+      place_y, place_x = dataset['place_yx'][batch_i[i], :]
+      pick_onehot[i, pick_y, pick_x] = 1
+      place_onehot[i, place_y, place_x] = 1
+      # pick_onehot[i, ...] = scipy.ndimage.gaussian_filter(pick_onehot[i, ...], sigma=3)
+
+      # Data augmentation (random translation).
+      roll_y, roll_x = np.random.randint(-112, 112, size=2)
+      img[i, ...] = np.roll(img[i, ...], roll_y, axis=0)
+      img[i, ...] = np.roll(img[i, ...], roll_x, axis=1)
+      pick_onehot[i, ...] = np.roll(pick_onehot[i, ...], roll_y, axis=0)
+      pick_onehot[i, ...] = np.roll(pick_onehot[i, ...], roll_x, axis=1)
+      place_onehot[i, ...] = np.roll(place_onehot[i, ...], roll_y, axis=0)
+      place_onehot[i, ...] = np.roll(place_onehot[i, ...], roll_x, axis=1)
+      pick_yx[i, 0] = pick_y + roll_y
+      pick_yx[i, 1] = pick_x + roll_x
+
+    # Backpropagate.
+    batch = {}
+    batch['img'] = jnp.float32(img)
+    batch['text'] = jnp.float32(text_feat)
+    batch['pick_yx'] = jnp.int32(pick_yx)
+    batch['pick_onehot'] = jnp.float32(pick_onehot)
+    batch['place_onehot'] = jnp.float32(place_onehot)
+    rng, batch['rng'] = jax.random.split(rng)
+    optim, loss, _, _ = train_step(optim, batch)
+    writer.scalar('train/loss', loss, train_iter)
+
+    if train_iter % np.power(10, min(4, np.floor(np.log10(train_iter)))) == 0:
+      print(f'Train Step: {train_iter} Loss: {loss}')
+
+    if train_iter % 1000 == 0:
+      checkpoints.save_checkpoint('.', optim, train_iter, prefix='ckpt_', keep=100000, overwrite=True)
+
+
+
+user_input = 'Pick the yellow block and place it on the blue bowl.'  #@param {type:"string"}
+
+# Show camera image before pick and place.
+
+def run_cliport(obs, text, env=None, clip_model=None, coords=None, optim=None, eval_step_fn=None):
+  before = env.get_camera_image()
+  prev_obs = obs['image'].copy()
+
+  # Tokenize text and get CLIP features.
+  text_tokens = clip.tokenize(text).cuda()
+  with torch.no_grad():
+    text_feats = clip_model.encode_text(text_tokens).float()
+  text_feats /= text_feats.norm(dim=-1, keepdim=True)
+  text_feats = np.float32(text_feats.cpu())
+
+  # Normalize image and add batch dimension.
+  img = obs['image'][None, ...] / 255
+  img = np.concatenate((img, coords[None, ...]), axis=3)
+
+  # Run Transporter Nets to get pick and place heatmaps.
+  batch = {'img': jnp.float32(img), 'text': jnp.float32(text_feats)}
+  pick_map, place_map = eval_step_fn(optim, batch)
+  pick_map, place_map = np.float32(pick_map), np.float32(place_map)
+
+  # Get pick position.
+  pick_max = np.argmax(np.float32(pick_map)).squeeze()
+  pick_yx = (pick_max // 224, pick_max % 224)
+  pick_yx = np.clip(pick_yx, 20, 204)
+  pick_xyz = obs['xyzmap'][pick_yx[0], pick_yx[1]]
+
+  # Get place position.
+  place_max = np.argmax(np.float32(place_map)).squeeze()
+  place_yx = (place_max // 224, place_max % 224)
+  place_yx = np.clip(place_yx, 20, 204)
+  place_xyz = obs['xyzmap'][place_yx[0], place_yx[1]]
+
+  # Step environment.
+  act = {'pick': pick_xyz, 'place': place_xyz}
+  obs, _, _, _ = env.step(act)
+
+  # Show pick and place action.
+  plt.title(text)
+  plt.imshow(prev_obs)
+  plt.arrow(pick_yx[1], pick_yx[0], place_yx[1]-pick_yx[1], place_yx[0]-pick_yx[0], color='w', head_starts_at_zero=False, head_width=7, length_includes_head=True)
+  plt.show()
+
+  # Show debug plots.
+  plt.subplot(1, 2, 1)
+  plt.title('Pick Heatmap')
+  plt.imshow(pick_map.reshape(224, 224))
+  plt.subplot(1, 2, 2)
+  plt.title('Place Heatmap')
+  plt.imshow(place_map.reshape(224, 224))
+  plt.show()
+
+  # Show video of environment rollout.
+  debug_clip = ImageSequenceClip(env.cache_video, fps=25)
+  display(debug_clip.ipython_display(autoplay=1, loop=1, center=False))
+  env.cache_video = []
+
+  # Show camera image after pick and place.
+  plt.subplot(1, 2, 1)
+  plt.title('Before')
+  plt.imshow(before)
+  plt.subplot(1, 2, 2)
+  plt.title('After')
+  after = env.get_camera_image()
+  plt.imshow(after)
+  plt.show()
+
+  # return pick_xyz, place_xyz, pick_map, place_map, pick_yx, place_yx
+  return obs
+
+
+# ============================================================================
+# CLIPort Interface Class for easy integration
+# ============================================================================
+
+import os
+
+# Get the saycan directory for checkpoint paths
+SAYCAN_DIR = os.path.dirname(os.path.abspath(__file__))
+
+
+class CLIPort:
+    """CLIPort model interface for text-conditioned pick and place.
+
+    This class provides a simple interface for using CLIPort without
+    needing to manage all the global variables and initialization.
+    """
+
+    def __init__(self):
+        self.clip_model = None
+        self.optim = None
+        self.coords = None
+        self._initialized = False
+
+    def _init(self):
+        """Lazy initialization of CLIP and Transporter models."""
+        if self._initialized:
+            return
+
+        print("Initializing CLIPort...")
+
+        # Initialize CLIP
+        self.clip_model, _ = clip.load("ViT-B/32")
+        if torch.cuda.is_available():
+            self.clip_model = self.clip_model.cuda()
+        self.clip_model.eval()
+
+        # Create coordinate tensor
+        h, w = 224, 224
+        y_coords = np.linspace(0, 1, h)
+        x_coords = np.linspace(0, 1, w)
+        xx, yy = np.meshgrid(x_coords, y_coords)
+        self.coords = np.stack([xx, yy], axis=-1).astype(np.float32)
+
+        # Initialize Transporter Net
+        rng = jax.random.PRNGKey(0)
+        rng, key = jax.random.split(rng)
+        init_img = jnp.ones((1, 224, 224, 5), jnp.float32)
+        init_text = jnp.ones((1, 512), jnp.float32)
+        init_params = TransporterNets().init(key, init_img, init_text)['params']
+
+        # Create optimizer state
+        optimizer_tx = optax.adam(learning_rate=1e-4)
+        self.optim = TrainState.create(
+            apply_fn=TransporterNets().apply,
+            params=init_params,
+            tx=optimizer_tx
+        )
+
+        # Try to load checkpoint
+        ckpt_path = os.path.join(SAYCAN_DIR, 'ckpt_40000')
+        if os.path.exists(ckpt_path):
+            try:
+                # Attempt to restore directly
+                self.optim = checkpoints.restore_checkpoint(ckpt_path, self.optim)
+                print(f"Loaded CLIPort checkpoint from {ckpt_path}")
+            except ValueError as e:
+                if "Missing field step" in str(e):
+                    print("Migrating old checkpoint format...")
+                    try:
+                        # Load the raw checkpoint data as a dictionary
+                        loaded_state_dict = checkpoints.restore_checkpoint(ckpt_path, target=None)
+                        if isinstance(loaded_state_dict, dict):
+                            # Extract parameters
+                            params_from_ckpt = loaded_state_dict.get('params', loaded_state_dict.get('target', init_params))
+                            # Re-initialize the opt_state with loaded parameters
+                            new_opt_state = optimizer_tx.init(params_from_ckpt)
+                            # Create a new TrainState with the loaded parameters
+                            self.optim = TrainState(
+                                step=0,
+                                params=params_from_ckpt,
+                                tx=optimizer_tx,
+                                opt_state=new_opt_state,
+                                apply_fn=TransporterNets().apply
+                            )
+                            print(f"Successfully migrated checkpoint from {ckpt_path}")
+                        else:
+                            print(f"Could not migrate checkpoint, using random initialization")
+                    except Exception as e2:
+                        print(f"Could not load CLIPort checkpoint: {e2}")
+                        print("Using random initialization - model may not perform well without training.")
+                else:
+                    raise
+            except Exception as e:
+                print(f"Could not load CLIPort checkpoint: {e}")
+                print("Using random initialization - model may not perform well without training.")
+        else:
+            print("No CLIPort checkpoint found, using random initialization.")
+            print("Run: python config.py  to download pretrained weights")
+
+        self._initialized = True
+
+    def encode_text(self, text):
+        """Encode text instruction using CLIP."""
+        with torch.no_grad():
+            tokens = clip.tokenize([text])
+            if torch.cuda.is_available():
+                tokens = tokens.cuda()
+            text_feats = self.clip_model.encode_text(tokens).float()
+            text_feats = text_feats / text_feats.norm(dim=-1, keepdim=True)
+            text_feats = text_feats.cpu().numpy()
+        return text_feats.astype(np.float32)
+
+    def predict(self, observation, text):
+        """
+        Predict pick and place coordinates from text instruction.
+
+        Args:
+            observation: Dict with 'image' and 'xyzmap'
+            text: Text instruction string
+
+        Returns:
+            action: Dict with 'pick' and 'place' 3D coordinates
+        """
+        self._init()
+
+        # Get image and encode text
+        image = observation['image']
+        xyzmap = observation['xyzmap']
+        text_feats = self.encode_text(text)
+
+        # Prepare image batch
+        img = image[np.newaxis, ...] / 255.0
+        img = np.concatenate([img, self.coords[np.newaxis, ...]], axis=-1)
+
+        # Run inference
+        def eval_step(optim, batch):
+            pick_out, place_out = TransporterNets().apply(
+                {'params': optim.params}, batch['img'], batch['text']
+            )
+            return pick_out, place_out
+
+        batch = {'img': jnp.float32(img), 'text': jnp.float32(text_feats)}
+        pick_map, place_map = eval_step(self.optim, batch)
+        pick_map, place_map = np.float32(pick_map[0]), np.float32(place_map[0])
+
+        # Get pick position
+        pick_max = np.argmax(pick_map.flatten())
+        pick_y, pick_x = np.unravel_index(pick_max, (224, 224))
+        pick_y, pick_x = np.clip(pick_y, 20, 204), np.clip(pick_x, 20, 204)
+        pick_xyz = xyzmap[pick_y, pick_x]
+
+        # Get place position
+        place_max = np.argmax(place_map.flatten())
+        place_y, place_x = np.unravel_index(place_max, (224, 224))
+        place_y, place_x = np.clip(place_y, 20, 204), np.clip(place_x, 20, 204)
+        place_xyz = xyzmap[place_y, place_x]
+
+        return {
+            'pick': pick_xyz,
+            'place': place_xyz,
+            'pick_map': pick_map,
+            'place_map': place_map
+        }
+
+
+# Global CLIPort instance
+_cliport = None
+
+
+def get_cliport():
+    """Get or create the global CLIPort instance."""
+    global _cliport
+    if _cliport is None:
+        _cliport = CLIPort()
+    return _cliport
\ No newline at end of file
diff --git a/saycan/config.py b/saycan/config.py
new file mode 100644
index 0000000..837eb5d
--- /dev/null
+++ b/saycan/config.py
@@ -0,0 +1,143 @@
+"""
+SayCan Configuration and Asset Downloader.
+
+This module provides global configuration constants for the SayCan environment
+and handles downloading required assets (robot URDFs, model weights).
+
+Original SayCan Repository:
+    https://github.com/google-research/google-research/tree/master/saycan
+
+Reference:
+    Ahn, M., Brohan, A., Brown, N., Chebotar, Y., Cortes, Y., David, B.,
+    Finn, C., Fu, C., Gopalakrishnan, K., Hausman, K., Herzog, A., Ho, D.,
+    Hsu, J., Ibarz, J., Ichter, B., Irpan, A., Jang, E., Jang, R., Julian, R.,
+    ... & Zeng, A. (2022). Do As I Can, Not As I Say: Grounding Language in
+    Robotic Affordances. arXiv preprint arXiv:2204.01691.
+"""
+
+import collections
+import datetime
+import os
+import random
+import threading
+import time
+
+import cv2  # Used by ViLD.
+import clip
+from easydict import EasyDict
+import flax
+from flax import linen as nn
+from flax.training import checkpoints
+from flax.metrics import tensorboard
+import imageio
+from heapq import nlargest
+import IPython
+import jax
+import jax.numpy as jnp
+import matplotlib.pyplot as plt
+from moviepy import ImageSequenceClip
+import numpy as np
+import optax
+import pickle
+from PIL import Image
+import pybullet
+import pybullet_data
+import tensorflow.compat.v1 as tf
+import torch
+from tqdm import tqdm
+
+import subprocess
+
+# Get the directory where this script is located
+SAYCAN_DIR = os.path.dirname(os.path.abspath(__file__))
+
+
+def download_assets():
+    """Download PyBullet robot assets, ViLD model weights, and CLIPort checkpoint."""
+    # Change to saycan directory for downloads
+    original_dir = os.getcwd()
+    os.chdir(SAYCAN_DIR)
+
+    try:
+        # Download PyBullet assets (UR5e robot, Robotiq gripper, bowl)
+        if not os.path.exists('ur5e/ur5e.urdf'):
+            print("Downloading UR5e robot assets...")
+            subprocess.run(['gdown', '--id', '1Cc_fDSBL6QiDvNT4dpfAEbhbALSVoWcc'], check=True)
+            subprocess.run(['gdown', '--id', '1yOMEm-Zp_DL3nItG9RozPeJAmeOldekX'], check=True)
+            subprocess.run(['gdown', '--id', '1GsqNLhEl9dd4Mc3BM0dX3MibOI1FVWNM'], check=True)
+
+            print("Extracting assets...")
+            subprocess.run(['unzip', '-o', 'ur5e.zip'], check=True)
+            subprocess.run(['unzip', '-o', 'robotiq_2f_85.zip'], check=True)
+            subprocess.run(['unzip', '-o', 'bowl.zip'], check=True)
+
+        # Download ViLD pretrained model weights
+        if not os.path.exists('image_path_v2'):
+            print("Downloading ViLD model weights...")
+            # Try using wget with public URL since gsutil may not be available
+            os.makedirs('image_path_v2/variables', exist_ok=True)
+            base_url = 'https://storage.googleapis.com/cloud-tpu-checkpoints/detection/projects/vild/colab/image_path_v2/'
+            subprocess.run(['wget', '-q', base_url + 'saved_model.pb', '-O', 'image_path_v2/saved_model.pb'], check=False)
+            subprocess.run(['wget', '-q', base_url + 'variables/variables.data-00000-of-00001', '-O', 'image_path_v2/variables/variables.data-00000-of-00001'], check=False)
+            subprocess.run(['wget', '-q', base_url + 'variables/variables.index', '-O', 'image_path_v2/variables/variables.index'], check=False)
+
+        # Download CLIPort pretrained checkpoint
+        if not os.path.exists('cliport_checkpoint'):
+            print("Downloading CLIPort pretrained checkpoint...")
+            os.makedirs('cliport_checkpoint', exist_ok=True)
+            # CLIPort checkpoint from original SayCan paper
+            subprocess.run(['gdown', '--id', '1NqJDTyxZOOqvCM2RZthJT5qPX3Xi-a-g', '-O', 'cliport_checkpoint/checkpoint'], check=False)
+
+        # Download training dataset (optional, for fine-tuning)
+        if not os.path.exists('dataset-9999.pkl'):
+            print("Downloading CLIPort training dataset...")
+            subprocess.run(['gdown', '--id', '1yCz6C-6eLWb4SFYKdkM-wz5tlMjbG2h8'], check=False)
+    finally:
+        os.chdir(original_dir)
+
+# Call download_assets() only when this script is run directly, not when imported
+if __name__ == "__main__":
+    download_assets()
+
+# =============================================================================
+# Global Constants
+# =============================================================================
+
+# Objects that can be picked up. Values are None placeholders here (presumably resolved by the environment - TODO confirm).
+PICK_TARGETS = {
+  "blue block": None,
+  "red block": None,
+  "green block": None,
+  "yellow block": None,
+}
+
+# RGBA colors for objects, each channel scaled from 0-255 to the 0-1 range.
+COLORS = {
+    "blue":   (78/255,  121/255, 167/255, 255/255),
+    "red":    (255/255,  87/255,  89/255, 255/255),
+    "green":  (89/255,  169/255,  79/255, 255/255),
+    "yellow": (237/255, 201/255,  72/255, 255/255),
+}
+
+# Target locations for placing objects (None = dynamic, tuple = fixed (x, y, z) position in meters).
+PLACE_TARGETS = {
+  "blue block": None,
+  "red block": None,
+  "green block": None,
+  "yellow block": None,
+
+  "blue bowl": None,
+  "red bowl": None,
+  "green bowl": None,
+  "yellow bowl": None,
+
+  "top left corner":     (-0.3 + 0.05, -0.2 - 0.05, 0),  # Corners are the BOUNDS x/y extents inset by 0.05.
+  "top right corner":    (0.3 - 0.05,  -0.2 - 0.05, 0),
+  "middle":              (0,           -0.5,        0),  # Center of the BOUNDS x/y rectangle.
+  "bottom left corner":  (-0.3 + 0.05, -0.8 + 0.05, 0),
+  "bottom right corner": (0.3 - 0.05,  -0.8 + 0.05, 0),
+}
+
+# Workspace configuration
+PIXEL_SIZE = 0.00267857  # Meters per pixel (equals the 0.6 m x/y extent of BOUNDS / 224; presumably the 224 px image side - TODO confirm).
+BOUNDS = np.float32([[-0.3, 0.3], [-0.8, -0.2], [0, 0.15]])  # X, Y, Z bounds in meters
\ No newline at end of file
diff --git a/saycan/datasets.py b/saycan/datasets.py
new file mode 100644
index 0000000..063198b
--- /dev/null
+++ b/saycan/datasets.py
@@ -0,0 +1,60 @@
+#@markdown Collect demonstrations with a scripted expert, or download a pre-generated dataset.
+load_pregenerated = True  #@param {type:"boolean"}
+
+# Load pre-existing dataset.
+if load_pregenerated:
+  if not os.path.exists('dataset-9999.pkl'):
+    # (disabled alternative) gdown --id 1TECwTIfawxkRYbzlAey0z1mqXKcyfPc-
+    import subprocess  # local import: the notebook "!gdown" magic is not valid Python
+    subprocess.run(['gdown', '--id', '1yCz6C-6eLWb4SFYKdkM-wz5tlMjbG2h8'], check=False)
+  with open('dataset-9999.pkl', 'rb') as f:  # ~10K samples. pickle: load trusted files only.
+    dataset = pickle.load(f)
+  dataset_size = len(dataset['text'])
+
+# Generate new dataset.
+else:
+  dataset = {}
+  dataset_size = 2  # Size of new dataset.
+  dataset['image'] = np.zeros((dataset_size, 224, 224, 3), dtype=np.uint8)
+  dataset['pick_yx'] = np.zeros((dataset_size, 2), dtype=np.int32)
+  dataset['place_yx'] = np.zeros((dataset_size, 2), dtype=np.int32)
+  dataset['text'] = []
+  policy = ScriptedPolicy(env)
+  data_idx = 0
+  while data_idx < dataset_size:
+    np.random.seed(data_idx)
+    num_pick, num_place = 3, 3
+
+    # Select random objects for data collection.
+    pick_items = list(PICK_TARGETS.keys())
+    pick_items = np.random.choice(pick_items, size=num_pick, replace=False)
+    place_items = list(PLACE_TARGETS.keys())
+    for pick_item in pick_items:  # For simplicity: place items != pick items.
+      place_items.remove(pick_item)
+    place_items = np.random.choice(place_items, size=num_place, replace=False)
+    config = {'pick': pick_items, 'place': place_items}
+
+    # Initialize environment with selected objects.
+    obs = env.reset(config)
+
+    # Create text prompts (the i-th pick item goes to the i-th place item).
+    prompts = [f'Pick the {pick_item} and place it on the {place_item}.'
+               for pick_item, place_item in zip(pick_items, place_items)]
+
+    # Execute 3 pick and place actions.
+    for prompt in prompts:
+      act = policy.step(prompt, obs)
+      dataset['text'].append(prompt)
+      dataset['image'][data_idx, ...] = obs['image'].copy()
+      dataset['pick_yx'][data_idx, ...] = xyz_to_pix(act['pick'])
+      dataset['place_yx'][data_idx, ...] = xyz_to_pix(act['place'])
+      data_idx += 1
+      obs, _, _, _ = env.step(act)
+      debug_clip = ImageSequenceClip(env.cache_video, fps=25)
+      display(debug_clip.ipython_display(autoplay=1, loop=1))
+      env.cache_video = []
+      if data_idx >= dataset_size:
+        break
+
+  with open(f'dataset-{dataset_size}.pkl', 'wb') as f:
+    pickle.dump(dataset, f)
\ No newline at end of file
diff --git a/saycan/environment.py b/saycan/environment.py
new file mode 100644
index 0000000..6e276ae
--- /dev/null
+++ b/saycan/environment.py
@@ -0,0 +1,310 @@
+"""
+SayCan Environment Wrapper for SHARPIE.
+
+This module wraps the PickPlaceEnv from the SayCan codebase to work with the
+SHARPIE experiment framework. It integrates:
+- ViLD for open-vocabulary object detection
+- LLM (via Ollama) for task planning and action scoring
+- CLIPort for language-conditioned pick-and-place manipulation
+
+Action Types:
+- "task:<description>" - Set task and auto-plan first action
+- "plan" - Get next planned action from LLM
+- "<text instruction>" - Direct CLIPort instruction
+- "done" - End episode
+
+Original SayCan Repository:
+    https://github.com/google-research/google-research/tree/master/saycan
+
+Reference:
+    Ahn, M., Brohan, A., Brown, N., Chebotar, Y., Cortes, Y., David, B.,
+    Finn, C., Fu, C., Gopalakrishnan, K., Hausman, K., Herzog, A., Ho, D.,
+    Hsu, J., Ibarz, J., Ichter, B., Irpan, A., Jang, E., Jang, R., Julian, R.,
+    ... & Zeng, A. (2022). Do As I Can, Not As I Say: Grounding Language in
+    Robotic Affordances. arXiv preprint arXiv:2204.01691.
+"""
+
+import os
+import sys
+import tempfile
+import numpy as np
+from PIL import Image
+
+# Add the saycan directory to path for imports
+SAYCAN_DIR = os.path.dirname(os.path.abspath(__file__))
+if SAYCAN_DIR not in sys.path:
+    sys.path.insert(0, SAYCAN_DIR)
+
+from pick_place_env import PickPlaceEnv
+from config import PICK_TARGETS, PLACE_TARGETS
+from cliport import get_cliport
+# Import LLM and helpers for planning
+from llm import make_options, gpt3_scoring, gpt3_context, termination_string
+from helpers import normalize_scores, step_to_nlp, affordance_scoring
+from vild import vild, category_name_string, vild_params
+
+
+class EnvironmentWrapper:
+    """Wrapper for the SayCan PickPlaceEnv with LLM planning and CLIPort integration."""
+
+    def __init__(self):
+        """Create the PickPlaceEnv and initialise step, video and planning state."""
+        self.env = PickPlaceEnv()
+        self.config = None  # last config passed to reset()
+        self._step_count = 0  # incremented at the start of every step() call
+        self._max_steps = 100  # step() reports truncated once _step_count reaches this
+        self._cliport = None  # loaded lazily by _step_with_text()
+        self.cached_video_frames = []  # frames handed out one per render() call
+
+        # LLM planning state
+        self._current_task = None
+        self._max_tasks = 10  # upper bound on planned steps per task (see plan_next_action)
+        self._gpt3_prompt = None  # few-shot context + task + the actions chosen so far
+        self._options = None  # candidate actions, built on the first plan
+        self._found_objects = None  # ViLD detections; None triggers a new detection
+        self._task_step_count = 0
+
+    def reset(self, config=None):
+        """
+        Reset the simulator and all per-episode wrapper state.
+
+        Args:
+            config: Optional configuration dict with 'pick' and 'place' lists.
+                   If None, a default three-block / two-bowl scene is used.
+
+        Returns:
+            observation: Whatever PickPlaceEnv.reset(config) returns
+            info: Dict with the step counter, the config and its pick/place lists
+        """
+        self._step_count = 0
+        self.cached_video_frames = []
+
+        # Reset LLM planning state
+        self._current_task = None
+        self._gpt3_prompt = None
+        self._options = None
+        self._found_objects = None
+        self._task_step_count = 0
+
+        if config is None:
+            config = {'pick':  ['yellow block', 'blue block', 'red block'],
+                      'place': ['blue bowl', 'red bowl']}
+
+        self.config = config
+        observation = self.env.reset(config)
+
+        info = {
+            "step": 0,
+            "config": config,
+            "pick_objects": config.get("pick", []),
+            "place_objects": config.get("place", [])
+        }
+
+        return observation, info
+
+    def set_task(self, task_text):
+        """
+        Start a new task: seed the LLM prompt and clear cached planning state.
+
+        Args:
+            task_text: Task instruction (e.g., "put all the blocks in different corners")
+        """
+        self._current_task = task_text
+        self._gpt3_prompt = gpt3_context + "\n# " + task_text + "\n"  # few-shot examples, then "# <task>"
+        self._task_step_count = 0
+        self._found_objects = None  # forces a fresh ViLD detection on the next plan
+        self._options = None
+        print(f"Environment: Task set to '{task_text}'")
+
+    def detect_objects(self, observation=None):
+        """
+        Detect objects in the scene using ViLD.
+
+        Args:
+            observation: Observation dict with 'image'. If None, one is fetched from the env.
+
+        Returns:
+            found_objects: Whatever vild() returns for the image (used as a list of names)
+        """
+        if observation is None:
+            observation = self.env.get_observation()
+
+        # ViLD takes a file path, so reserve a temp file name and close the handle
+        # before PIL writes to it (an open handle blocks re-opening on Windows).
+        with tempfile.NamedTemporaryFile(suffix='.jpg', delete=False) as f:
+            temp_path = f.name
+
+        try:
+            # Save inside the try block so the temp file is removed even if saving fails
+            Image.fromarray(observation['image']).save(temp_path)
+            prompt_swaps = [('block', 'cube')]
+            found_objects = vild(temp_path, category_name_string, vild_params,
+                                plot_on=False, prompt_swaps=prompt_swaps)
+            print(f"Environment: Detected objects: {found_objects}")
+        finally:
+            # Clean up temp file
+            os.unlink(temp_path)
+
+        return found_objects
+
+    def plan_next_action(self, observation=None):
+        """
+        Plan the next action using LLM + affordance scoring (call set_task() first).
+
+        Args:
+            observation: Observation to plan from. If None, one is fetched from the env.
+
+        Returns:
+            action_text: Natural language action instruction, or "done"
+            done: True on termination or once _max_tasks actions have been planned
+        """
+        # Enforce the limit before planning, so an action is never selected and then dropped
+        if self._task_step_count >= self._max_tasks:
+            print("Environment: Max steps reached")
+            return "done", True
+
+        if observation is None:
+            observation = self.env.get_observation()
+
+        # Detect objects if not already done
+        if self._found_objects is None:
+            self._found_objects = self.detect_objects(observation)
+
+        # Create options if not already done
+        if self._options is None:
+            self._options = make_options(PICK_TARGETS, PLACE_TARGETS,
+                                         termination_string=termination_string)
+
+        # Calculate affordance scores based on detected objects
+        affordance_scores = affordance_scoring(self._options, self._found_objects,
+                                               block_name="box", bowl_name="circle",
+                                               termination_string=termination_string, verbose=False)
+
+        # Get LLM scores
+        llm_scores, _ = gpt3_scoring(self._gpt3_prompt, self._options, verbose=True)
+
+        # Combine scores (LLM scores are log-probabilities, hence the exp)
+        combined_scores = {
+            option: np.exp(llm_scores[option]) * affordance_scores[option]
+            for option in self._options
+        }
+        combined_scores = normalize_scores(combined_scores)
+
+        # Select best action
+        selected_task = max(combined_scores, key=combined_scores.get)
+
+        # Check for termination
+        if selected_task == termination_string:
+            print("Environment: Task completed (termination signal)")
+            return "done", True
+
+        # Update prompt for next step
+        self._gpt3_prompt += selected_task + "\n"
+        self._task_step_count += 1
+
+        # Convert to natural language
+        action_text = step_to_nlp(selected_task)
+        print(f"Environment: Step {self._task_step_count} - {action_text}")
+        return action_text, False
+
+    def step(self, action_dict):
+        """
+        Execute one step in the environment.
+
+        Args:
+            action_dict: Dict mapping agent id to action (first value is used), or a bare action.
+                        Action can be:
+                        - string text instruction, executed directly by CLIPort
+                        - "task:<description>" to set a task and execute its first planned action
+                        - "plan" to plan and execute the next action (re-detects objects first)
+                        - "done" to end the episode
+
+        Returns:
+            observation: New observation, or an empty array when no action was executed
+            reward: Reward for the action (float), 0.0 when no action was executed
+            terminated: True only for the "done" action
+            truncated: _step_count >= _max_steps, reported only on calls that executed an action
+            info: Additional information (dict)
+        """
+        self._step_count += 1  # NOTE(review): also counts the no-op calls below - confirm intended
+        # Frames from the previous action are still queued for render(): do nothing this call
+        if len(self.cached_video_frames) > 0:
+            return np.array([]), 0.0, False, False, {"info": "No action taken"}
+
+        # Extract action from dict (single-agent environment)
+        action = list(action_dict.values())[0] if isinstance(action_dict, dict) else action_dict
+
+        # Handle different action types
+        if action == 'done':
+            return np.array([]), 0.0, True, False, {"info": "Task completed"}
+        elif isinstance(action, str) and action.startswith('task:'):
+            # Set task and plan first action
+            task_text = action[5:].strip()
+            self.set_task(task_text)
+            action_text, task_done = self.plan_next_action()
+            if task_done or action_text == "done":
+                return np.array([]), 0.0, False, False, {"info": "Task completed"}
+            action = action_text
+        elif action == 'plan':
+            # Get next planned action
+            if self._current_task is None:
+                return np.array([]), 0.0, False, False, {"info": "No task set"}
+            # Re-detect objects since they may have moved
+            self._found_objects = None
+            action_text, task_done = self.plan_next_action()
+            if task_done or action_text == "done":
+                return np.array([]), 0.0, False, False, {"info": "Task completed"}
+            action = action_text
+
+        if action and action != "done":
+            # Direct text instruction (or the planned action text from the branches above)
+            obs, reward, _, info = self._step_with_text(action)
+            # Same list object as env.cache_video; render() pops frames from it
+            self.cached_video_frames = self.env.cache_video
+        else:
+            return np.array([]), 0.0, False, False, {"info": "No action taken"}
+
+        # Check termination conditions
+        terminated = False
+        truncated = self._step_count >= self._max_steps
+
+        info["step"] = self._step_count
+        info["max_steps"] = self._max_steps
+
+        return obs, reward, terminated, truncated, info
+
+    def _step_with_text(self, text):
+        """Predict a pick/place action for `text` with CLIPort and execute it in the env."""
+        if self._cliport is None:
+            self._cliport = get_cliport()  # loaded on first use
+
+        # Get current observation
+        obs = self.env.get_observation()
+
+        # Use CLIPort to predict action
+        action = self._cliport.predict(obs, text)
+
+        # Execute only the 'pick' and 'place' entries of the prediction
+        obs, reward, done, info = self.env.step({
+            'pick': action['pick'],
+            'place': action['place']
+        })
+        # Record the instruction and the full CLIPort prediction alongside the env's info
+        info['text_instruction'] = text
+        info['cliport_action'] = action
+
+        return obs, reward, done, info
+
+    def render(self):
+        """Return the oldest cached video frame, or a live camera image if none are queued."""
+        if not self.cached_video_frames:
+            return self.env.get_camera_image()
+        return self.cached_video_frames.pop(0)
+
+    def get_observation(self):
+        """Return the wrapped env's get_observation() result; does not call step()."""
+        return self.env.get_observation()
+
+
+# Module-level instance for the SHARPIE runner; importing this module constructs PickPlaceEnv.
+environment = EnvironmentWrapper()
\ No newline at end of file
diff --git a/saycan/helpers.py b/saycan/helpers.py
new file mode 100644
index 0000000..93c1028
--- /dev/null
+++ b/saycan/helpers.py
@@ -0,0 +1,111 @@
+"""
+SayCan Helper Functions.
+
+Utility functions for affordance scoring, scene description, and visualization.
+
+Original SayCan Repository:
+    https://github.com/google-research/google-research/tree/master/saycan
+
+Reference:
+    Ahn, M., et al. (2022). Do As I Can, Not As I Say: Grounding Language in
+    Robotic Affordances. arXiv preprint arXiv:2204.01691.
+"""
+
+import numpy as np
+import matplotlib.pyplot as plt
+from heapq import nlargest
+from config import PLACE_TARGETS
+
+def build_scene_description(found_objects, block_name="box", bowl_name="circle"):
+  # Render the detections as an "objects = [...]" line, mapping detector names to task names.
+  description = f"objects = {found_objects}"
+  for old_text, new_text in ((block_name, "block"), (bowl_name, "bowl"), ("'", "")):
+    description = description.replace(old_text, new_text)
+  return description
+
+def step_to_nlp(step):
+  # Turn "robot.pick_and_place(<pick>, <place>)" into an English instruction.
+  arguments = step.replace("robot.pick_and_place(", "").replace(")", "")
+  pick, place = arguments.split(", ")
+  return f"Pick the {pick} and place it on the {place}."
+
+def normalize_scores(scores):
+  # Scale so the best score is 1 (clipped to [0, 1]); empty or all-zero input yields zeros, not a division by zero.
+  max_score = max(scores.values(), default=0)
+  return {key: np.clip(scores[key] / max_score, 0, 1) if max_score else 0.0 for key in scores}
+
+def plot_saycan(llm_scores, vfs, combined_scores, task, correct=True, show_top=None):
+  """Bar-plot LLM, affordance (vfs) and combined scores per option, titled with `task`."""
+  if show_top:
+    top_options = nlargest(show_top, combined_scores, key = combined_scores.get)
+    # add a few top llm options in if not already shown
+    top_llm_options = nlargest(show_top // 2, llm_scores, key = llm_scores.get)
+    for llm_option in top_llm_options:
+      if not llm_option in top_options:
+        top_options.append(llm_option)
+    llm_scores = {option: llm_scores[option] for option in top_options}
+    vfs = {option: vfs[option] for option in top_options}
+    combined_scores = {option: combined_scores[option] for option in top_options}
+  # Options are plotted in sorted (alphabetical) key order.
+  sorted_keys = dict(sorted(combined_scores.items()))
+  keys = [key for key in sorted_keys]
+  positions = np.arange(len(combined_scores.items()))
+  width = 0.3
+  fig = plt.figure(figsize=(12, 6))
+  ax1 = fig.add_subplot(1,1,1)
+  # LLM scores are log-probabilities: exponentiate, then rescale so the best is 1.
+  plot_llm_scores = normalize_scores({key: np.exp(llm_scores[key]) for key in sorted_keys})
+  plot_llm_scores = np.asarray([plot_llm_scores[key] for key in sorted_keys])
+  plot_affordance_scores = np.asarray([vfs[key] for key in sorted_keys])
+  plot_combined_scores = np.asarray([combined_scores[key] for key in sorted_keys])
+
+  ax1.bar(positions, plot_combined_scores, 3 * width, alpha=0.6, color="#93CE8E", label="combined")
+  # The two zero-height bars below only register "vfs"/"language" in ax1's legend.
+  score_colors = ["#ea9999ff" for score in plot_affordance_scores]
+  ax1.bar(positions + width / 2, 0 * plot_combined_scores, width, color="#ea9999ff", label="vfs")
+  ax1.bar(positions + width / 2, 0 * plot_combined_scores, width, color="#a4c2f4ff", label="language")
+  ax1.bar(positions - width / 2, np.abs(plot_affordance_scores), width, color=score_colors)
+
+  plt.xticks(rotation="vertical")
+  ax1.set_ylim(0.0, 1.0)
+
+  ax1.grid(True, which="both")
+  ax1.axis("on")
+  # LLM scores get their own y-axis on a twin of ax1.
+  ax1_llm = ax1.twinx()
+  ax1_llm.bar(positions + width / 2, plot_llm_scores, width, color="#a4c2f4ff", label="language")
+  ax1_llm.set_ylim(0.01, 1.0)
+  plt.yscale("log")
+  # Title and tick labels are black when `correct` is true, red otherwise.
+  font = {"fontname":"Arial", "size":"16", "color":"k" if correct else "r"}
+  plt.title(task, **font)
+  key_strings = [key.replace("robot.pick_and_place", "").replace(", ", " to ").replace("(", "").replace(")","") for key in keys]
+  plt.xticks(positions, key_strings, **font)
+  ax1.legend()
+  plt.show()
+
+
+
+#@title Affordance Scoring
+#@markdown Given that this environment does not have RL-trained policies or an associated value function, we use affordances through an object detector.
+
+def affordance_scoring(options, found_objects, verbose=False, block_name="box", bowl_name="circle", termination_string="done()"):
+  """Return {option: score}: 1 if both pick and place targets are available, else 0 (0.2 for termination)."""
+  # Fixed locations (PLACE_TARGETS entries with coordinates) are always available as place targets.
+  fixed_places = [name for name, position in PLACE_TARGETS.items() if position is not None]
+  available = [name.replace(block_name, "block").replace(bowl_name, "bowl")
+               for name in list(found_objects) + fixed_places]
+  verbose and print("found_objects", available)
+  affordance_scores = {}
+  for option in options:
+    if option == termination_string:
+      affordance_scores[option] = 0.2  # constant score for the terminate option
+      continue
+    pick, place = option.replace("robot.pick_and_place(", "").replace(")", "").split(", ")
+    remaining = available.copy()
+    if pick in remaining:
+      remaining.remove(pick)  # one detection cannot serve as both pick and place target
+    affordance = int(pick in available and place in remaining)
+    affordance_scores[option] = affordance
+    verbose and print(affordance, '\t', option)
+  return affordance_scores
\ No newline at end of file
diff --git a/saycan/llm.py b/saycan/llm.py
new file mode 100644
index 0000000..34e1058
--- /dev/null
+++ b/saycan/llm.py
@@ -0,0 +1,170 @@
+"""
+SayCan LLM Module - Language Model Integration for Task Planning.
+
+This module provides integration with Large Language Models (LLMs) for task
+planning and action scoring. Originally designed for GPT-3, now adapted for
+local Ollama-based models.
+
+The LLM provides:
+- Few-shot prompting for task decomposition
+- Action scoring based on task context
+- Natural language to action mapping
+
+Original SayCan Repository:
+    https://github.com/google-research/google-research/tree/master/saycan
+
+Reference:
+    Ahn, M., et al. (2022). Do As I Can, Not As I Say: Grounding Language in
+    Robotic Affordances. arXiv preprint arXiv:2204.01691.
+"""
+
+import ollama
+
+# Ollama client configuration
+client = ollama.Client(host='http://localhost:11434')
+ENGINE = "llama3.2:1b"
+
+from config import PICK_TARGETS, PLACE_TARGETS
+
+# LLM Cache for repeated queries
+LLM_CACHE = {}
+
+
+def gpt3_call(engine=ENGINE, prompt="", max_tokens=128, temperature=0):
+    """
+    Call the LLM, returning a cached response for a repeated query.
+
+    Args:
+        engine: Model name to use
+        prompt: Input prompt string
+        max_tokens: Maximum tokens to generate (only forwarded when > 0)
+        temperature: Sampling temperature (always forwarded, including 0)
+
+    Returns:
+        Generated text response
+    """
+    cache_id = (engine, prompt, max_tokens, temperature)
+    if cache_id in LLM_CACHE:
+        print('cache hit, returning cached response')
+        return LLM_CACHE[cache_id]
+
+    # Always forward the temperature: skipping it at 0 would let Ollama fall back to
+    # its own default and turn the "deterministic" (and then cached) call into a sampled one.
+    ollama_options = {'temperature': temperature}
+    if max_tokens > 0:
+        ollama_options['num_predict'] = max_tokens
+
+    response = client.generate(model=engine, prompt=prompt, options=ollama_options)
+    generated_text = response['response']
+    LLM_CACHE[cache_id] = generated_text
+    return generated_text
+
+
+def gpt3_scoring(query, options, engine=ENGINE, limit_num_options=None, option_start="\n", verbose=False, print_tokens=False):
+    """
+    Assign an LLM score to every action option.
+
+    Note: No log probabilities are read from the local model, so every option
+    gets the same score; the discrimination comes from affordance scoring.
+
+    Args:
+        query: Prompt context for scoring (unused)
+        options: List of action options to score
+        engine: Model name (unused)
+        limit_num_options: If truthy, score only the first this-many options
+        option_start: Prefix for options (unused)
+        verbose: Print scoring details
+        print_tokens: Print token details (unused)
+
+    Returns:
+        Tuple of (scores dict, empty response dict)
+    """
+    candidates = options[:limit_num_options] if limit_num_options else options
+    if verbose:
+        print("Scoring", len(candidates), "options with uniform LLM scores.")
+
+    # Every option gets the same log-probability (0.0, i.e. probability 1)
+    uniform_logprob = 0.0
+    scores = dict.fromkeys(candidates, uniform_logprob)
+
+    if verbose:
+        # Stable sort by descending score; print at most the first 11 entries
+        ranked = sorted(scores.items(), key=lambda item: -item[1])
+        for option, score in ranked[:11]:
+            print(score, "\t", option)
+
+    return scores, {}
+
+
+def make_options(pick_targets=None, place_targets=None, options_in_api_form=True, termination_string="done()"):
+    """
+    Build the list of every pick/place combination plus the termination option.
+
+    Args:
+        pick_targets: Iterable of pickable object names (PICK_TARGETS if None or empty)
+        place_targets: Iterable of place target names (PLACE_TARGETS if None or empty)
+        options_in_api_form: If True, use API format; otherwise natural language
+        termination_string: Option appended last to signal task completion
+
+    Returns:
+        List of action option strings
+    """
+    # Fall back to the module defaults when a target collection is missing or empty.
+    pick_targets = pick_targets or PICK_TARGETS
+    place_targets = place_targets or PLACE_TARGETS
+
+    # Choose the string template once instead of branching inside the loops.
+    if options_in_api_form:
+        template = "robot.pick_and_place({pick}, {place})"
+    else:
+        template = "Pick the {pick} and place it on the {place}."
+
+    # Every pick target is paired with every place target, pick-major order.
+    options = [template.format(pick=pick, place=place)
+               for pick in pick_targets for place in place_targets]
+
+    options.append(termination_string)
+    print("Considering", len(options), "options")
+    return options
+
+
+# Termination string for task completion
+termination_string = "done()"
+
+# Few-shot prompt examples for task decomposition
+gpt3_context = """
+objects = [red block, yellow block, blue block, green bowl]
+# move all the blocks to the top left corner.
+robot.pick_and_place(blue block, top left corner)
+robot.pick_and_place(red block, top left corner)
+robot.pick_and_place(yellow block, top left corner)
+done()
+
+objects = [red block, yellow block, blue block, green bowl]
+# put the yellow one the green thing.
+robot.pick_and_place(yellow block, green bowl)
+done()
+
+objects = [yellow block, blue block, red block]
+# move the light colored block to the middle.
+robot.pick_and_place(yellow block, middle)
+done()
+
+objects = [blue block, green bowl, red block, yellow bowl, green block]
+# stack the blocks.
+robot.pick_and_place(green block, blue block)
+robot.pick_and_place(red block, green block)
+done()
+
+objects = [red block, blue block, green bowl, blue bowl, yellow block, green block]
+# group the blue objects together.
+robot.pick_and_place(blue block, blue bowl)
+done()
+
+objects = [green bowl, red block, green block, red bowl, yellow bowl, yellow block]
+# sort all the blocks into their matching color bowls.
+robot.pick_and_place(green block, green bowl)
+robot.pick_and_place(red block, red bowl)
+robot.pick_and_place(yellow block, yellow bowl)
+done()
+"""
\ No newline at end of file
diff --git a/saycan/pick_place_env.py b/saycan/pick_place_env.py
new file mode 100644
index 0000000..892d94c
--- /dev/null
+++ b/saycan/pick_place_env.py
@@ -0,0 +1,487 @@
+"""
+SayCan Pick and Place Environment.
+
+A Gym-style PyBullet environment for robotic pick-and-place manipulation tasks.
+This environment simulates a UR5e robot arm with a Robotiq 2F-85 gripper
+manipulating blocks and bowls on a workspace.
+
+Key Features:
+- UR5e robot arm with Robotiq 2F-85 gripper
+- Configurable objects (blocks and bowls of various colors)
+- Pick-and-place motion primitives
+- RGB-D observation with heightmap generation
+- Video recording of episodes
+
+Original SayCan Repository:
+    https://github.com/google-research/google-research/tree/master/saycan
+
+Reference:
+    Ahn, M., et al. (2022). Do As I Can, Not As I Say: Grounding Language in
+    Robotic Affordances. arXiv preprint arXiv:2204.01691.
+"""
+
+import os
+import numpy as np
+import pybullet
+import pybullet_data
+from robot import Robotiq2F85
+from config import COLORS, BOUNDS, PIXEL_SIZE, SAYCAN_DIR
+
+class PickPlaceEnv():
+
+  def __init__(self):
+    """Connect to a headless PyBullet instance and store robot constants.
+
+    No bodies are loaded here; the scene itself is built by `reset`.
+    """
+    # Simulation clock: the physics time step is 1/480 s.
+    self.dt = 1/480
+    self.sim_step = 0
+
+    # Robot constants used by reset() and the motion primitives.
+    self.home_joints = (np.pi / 2, -np.pi / 2, np.pi / 2, -np.pi / 2, 3 * np.pi / 2, 0)  # Joint angles: (J0, J1, J2, J3, J4, J5).
+    self.home_ee_euler = (np.pi, 0, np.pi)  # (RX, RY, RZ) rotation in Euler angles.
+    self.ee_link_id = 9  # Link ID of UR5 end effector.
+    self.tip_link_id = 10  # Link ID of gripper finger tips.
+    self.gripper = None  # Created by reset().
+
+    # Configure and start PyBullet in DIRECT (no GUI) mode.
+    # For a shared-memory server: python3 -m pybullet_utils.runServer and
+    # pybullet.connect(pybullet.SHARED_MEMORY); use pybullet.GUI for a local GUI.
+    pybullet.connect(pybullet.DIRECT)
+    pybullet.configureDebugVisualizer(pybullet.COV_ENABLE_GUI, 0)
+    pybullet.setPhysicsEngineParameter(enableFileCaching=0)
+    # NOTE(review): two consecutive setAdditionalSearchPath calls; the second
+    # may replace the first rather than add to it - confirm. reset() loads the
+    # SayCan URDFs through absolute paths, so it does not rely on the first.
+    pybullet.setAdditionalSearchPath(SAYCAN_DIR)
+    pybullet.setAdditionalSearchPath(pybullet_data.getDataPath())
+    pybullet.setTimeStep(self.dt)
+
+  def reset(self, config):
+    """Rebuild the scene from scratch and return the first observation.
+
+    Args:
+      config: mapping with "pick" and "place" iterables of object names.
+        Names containing "block" or "bowl" are spawned and must have the form
+        "<color> <type>" with <color> a key of COLORS; other names (e.g.
+        place locations) are skipped.
+
+    Returns:
+      The observation dict produced by get_observation().
+    """
+    pybullet.resetSimulation(pybullet.RESET_USE_DEFORMABLE_WORLD)
+    pybullet.setGravity(0, 0, -9.8)
+    self.cache_video = []
+
+    # Temporarily disable rendering to load URDFs faster.
+    pybullet.configureDebugVisualizer(pybullet.COV_ENABLE_RENDERING, 0)
+
+    # Add ground plane (from pybullet_data) and robot (from saycan directory).
+    pybullet.loadURDF("plane.urdf", [0, 0, -0.001])
+    ur5e_urdf = os.path.join(SAYCAN_DIR, "ur5e", "ur5e.urdf")
+    self.robot_id = pybullet.loadURDF(ur5e_urdf, [0, 0, 0], flags=pybullet.URDF_USE_MATERIAL_COLORS_FROM_MTL)
+    self.ghost_id = pybullet.loadURDF(ur5e_urdf, [0, 0, -10])  # For forward kinematics.
+    joint_infos = [pybullet.getJointInfo(self.robot_id, i)
+                   for i in range(pybullet.getNumJoints(self.robot_id))]
+    self.joint_ids = [info[0] for info in joint_infos if info[2] == pybullet.JOINT_REVOLUTE]
+
+    # Move robot to home configuration.
+    for i, joint_id in enumerate(self.joint_ids):
+      pybullet.resetJointState(self.robot_id, joint_id, self.home_joints[i])
+
+    # Add gripper. The previous gripper's constraint thread must be finished
+    # before a new one is started; join() blocks until then without spinning
+    # the CPU. (Presumably the thread stops because its body was removed by
+    # resetSimulation above - confirm against Robotiq2F85.step.)
+    if self.gripper is not None:
+      self.gripper.constraints_thread.join()
+    self.gripper = Robotiq2F85(self.robot_id, self.ee_link_id)
+    self.gripper.release()
+
+    # Add workspace.
+    plane_shape = pybullet.createCollisionShape(pybullet.GEOM_BOX, halfExtents=[0.3, 0.3, 0.001])
+    plane_visual = pybullet.createVisualShape(pybullet.GEOM_BOX, halfExtents=[0.3, 0.3, 0.001])
+    plane_id = pybullet.createMultiBody(0, plane_shape, plane_visual, basePosition=[0, -0.5, 0])
+    pybullet.changeVisualShape(plane_id, -1, rgbaColor=[0.2, 0.2, 0.2, 1.0])
+
+    # Load objects according to config.
+    self.config = config
+    self.obj_name_to_id = {}
+    obj_names = list(self.config["pick"]) + list(self.config["place"])
+    obj_xyz = np.zeros((0, 3))
+    for obj_name in obj_names:
+      if ("block" in obj_name) or ("bowl" in obj_name):
+
+        # Rejection-sample a position 15cm+ from all previously placed
+        # objects, keeping a 10cm margin to the workspace bounds.
+        while True:
+          rand_x = np.random.uniform(BOUNDS[0, 0] + 0.1, BOUNDS[0, 1] - 0.1)
+          rand_y = np.random.uniform(BOUNDS[1, 0] + 0.1, BOUNDS[1, 1] - 0.1)
+          rand_xyz = np.float32([rand_x, rand_y, 0.03]).reshape(1, 3)
+          if len(obj_xyz) == 0 or np.min(np.linalg.norm(obj_xyz - rand_xyz, axis=1)) > 0.15:
+            obj_xyz = np.concatenate((obj_xyz, rand_xyz), axis=0)
+            break
+
+        object_color = COLORS[obj_name.split(" ")[0]]
+        object_type = obj_name.split(" ")[1]
+        object_position = rand_xyz.squeeze()
+        if object_type == "block":
+          # 4cm cube with a mass of 0.01.
+          object_shape = pybullet.createCollisionShape(pybullet.GEOM_BOX, halfExtents=[0.02, 0.02, 0.02])
+          object_visual = pybullet.createVisualShape(pybullet.GEOM_BOX, halfExtents=[0.02, 0.02, 0.02])
+          object_id = pybullet.createMultiBody(0.01, object_shape, object_visual, basePosition=object_position)
+        elif object_type == "bowl":
+          # Bowls sit on the ground (z = 0) and are loaded with a fixed base.
+          object_position[2] = 0
+          bowl_urdf = os.path.join(SAYCAN_DIR, "bowl", "bowl.urdf")
+          object_id = pybullet.loadURDF(bowl_urdf, object_position, useFixedBase=1)
+        pybullet.changeVisualShape(object_id, -1, rgbaColor=object_color)
+        self.obj_name_to_id[obj_name] = object_id
+
+    # Re-enable rendering.
+    pybullet.configureDebugVisualizer(pybullet.COV_ENABLE_RENDERING, 1)
+
+    # Step the simulation 200 times before taking the first observation.
+    for _ in range(200):
+      pybullet.stepSimulation()
+    return self.get_observation()
+
+  def servoj(self, joints):
+    """Command the arm joints toward `joints` using position control.
+
+    Args:
+      joints: target positions, one per entry of self.joint_ids.
+    """
+    gains = [0.01] * 6  # One position gain per arm joint.
+    pybullet.setJointMotorControlArray(
+      bodyIndex=self.robot_id,
+      jointIndices=self.joint_ids,
+      controlMode=pybullet.POSITION_CONTROL,
+      targetPositions=joints,
+      positionGains=gains)
+
+  def movep(self, position):
+    """Servo the arm toward an end effector position.
+
+    Solves inverse kinematics for the finger-tip link at `position`, with the
+    orientation fixed to self.home_ee_euler, and forwards the resulting joint
+    targets to servoj().
+
+    Args:
+      position: target XYZ for the tip link.
+    """
+    target_quat = pybullet.getQuaternionFromEuler(self.home_ee_euler)
+    joint_targets = pybullet.calculateInverseKinematics(
+        bodyUniqueId=self.robot_id,
+        endEffectorLinkIndex=self.tip_link_id,
+        targetPosition=position,
+        targetOrientation=target_quat,
+        maxNumIterations=100)
+    self.servoj(joint_targets)
+
+  def step(self, action=None):
+    """Do pick and place motion primitive.
+
+    Sequence: hover above the pick point, descend, close the gripper, lift
+    back to hover, travel to the place point, lower until contact (or the
+    minimum height), open the gripper, retreat upward and return to the
+    workspace centre.
+
+    Args:
+      action: mapping with "pick" and "place" entries that support .copy()
+        and index assignment (presumably XYZ numpy arrays - confirm against
+        the caller). The None default is not usable: subscripting it fails.
+
+    Returns:
+      (observation, reward, done, info); done is always False and info is
+      always an empty dict.
+    """
+    def tip_xyz():
+      # Current world position of the finger-tip link.
+      return np.float32(pybullet.getLinkState(self.robot_id, self.tip_link_id)[0])
+
+    def servo_to(target, current):
+      # Keep commanding `target` until the tip is within 1cm of it. There is
+      # no iteration cap, so an unreachable target never returns.
+      while np.linalg.norm(target - current) > 0.01:
+        self.movep(target)
+        self.step_sim_and_render()
+        current = tip_xyz()
+      return current
+
+    def settle(n_steps=240):
+      # Let the simulation advance so the fingers can finish moving.
+      for _ in range(n_steps):
+        self.step_sim_and_render()
+
+    pick_xyz = action["pick"].copy()
+    place_xyz = action["place"].copy()
+
+    # Set fixed primitive z-heights.
+    hover_xyz = pick_xyz.copy() + np.float32([0, 0, 0.2])
+    pick_xyz[2] = 0.03
+    place_xyz[2] = 0.15
+
+    # Move to object.
+    ee_xyz = tip_xyz()
+    ee_xyz = servo_to(hover_xyz, ee_xyz)
+    ee_xyz = servo_to(pick_xyz, ee_xyz)
+
+    # Pick up object.
+    self.gripper.activate()
+    settle()
+    ee_xyz = servo_to(hover_xyz, ee_xyz)
+
+    # Move to place location.
+    ee_xyz = servo_to(place_xyz, ee_xyz)
+
+    # Place down object: lower 1mm at a time until contact or minimum height.
+    while (not self.gripper.detect_contact()) and (place_xyz[2] > 0.03):
+      place_xyz[2] -= 0.001
+      self.movep(place_xyz)
+      settle(3)
+    self.gripper.release()
+    settle()
+
+    # Retreat straight up, then return above the workspace centre.
+    place_xyz[2] = 0.2
+    ee_xyz = tip_xyz()
+    ee_xyz = servo_to(place_xyz, ee_xyz)
+    place_xyz = np.float32([0, -0.5, 0.2])
+    ee_xyz = servo_to(place_xyz, ee_xyz)
+
+    observation = self.get_observation()
+    reward = self.get_reward()
+    return observation, reward, False, {}
+
+  def set_alpha_transparency(self, alpha: float) -> None:
+    """Set the alpha channel of every robot and gripper visual shape.
+
+    Each shape keeps its own RGB and only the alpha component is replaced, so
+    calling this with 0 and then 1 hides and restores the arm without
+    altering its colours.
+
+    Args:
+      alpha: opacity written to the shapes (the caller in this class passes
+        0 to hide and 1 to show).
+    """
+    for body_id in (self.robot_id, self.gripper.body):
+      for shape in pybullet.getVisualShapeData(body_id):
+        # Entry layout: [1] is the link index, [7] is the RGBA colour.
+        link_index, rgba_color = shape[1], shape[7]
+        pybullet.changeVisualShape(
+            body_id,
+            linkIndex=link_index,
+            rgbaColor=list(rgba_color[0:3]) + [alpha])
+
+  def step_sim_and_render(self):
+    """Advance physics by one step and record a video frame every 60 steps."""
+    pybullet.stepSimulation()
+    self.sim_step += 1
+
+    # With dt = 1/480 s, one frame per 60 steps is 8 frames per simulated second.
+    if self.sim_step % 60:
+      return
+    self.cache_video.append(self.get_camera_image())
+
+  def get_camera_image(self, resolution_factor=4):
+    """
+    Return the colour image from render_image() at a scaled resolution.
+
+    The image side is 240 * resolution_factor pixels and the focal length is
+    scaled by the same factor.
+
+    Args:
+        resolution_factor: Multiplier for resolution (default 4 = 960x960)
+                          1 = 240x240, 2 = 480x480, 3 = 720x720, 4 = 960x960
+
+    Returns:
+        The first element (colour image) of render_image()'s result.
+    """
+    side = 240 * resolution_factor
+    focal = 120. * resolution_factor
+    intrinsics = (focal, 0, focal, 0, focal, focal, 0, 0, 1)
+    return self.render_image((side, side), intrinsics)[0]
+
+  def get_camera_image_top(self,
+                           image_size=(240, 240),
+                           intrinsics=(2000., 0, 2000., 0, 2000., 2000., 0, 0, 1),
+                           position=(0, -0.5, 5),
+                           orientation=(0, np.pi, -np.pi / 2),
+                           zrange=(0.01, 1.),
+                           set_alpha=True):
+    """Return the colour image from render_image_top().
+
+    Args:
+      image_size, intrinsics, position, orientation, zrange: forwarded
+        unchanged to render_image_top().
+      set_alpha: when truthy, set_alpha_transparency(0) is called before the
+        render and set_alpha_transparency(1) after it.
+
+    Returns:
+      The first element (colour image) of render_image_top()'s result.
+    """
+    if set_alpha:
+      self.set_alpha_transparency(0)
+    rendered = self.render_image_top(image_size,
+                                     intrinsics,
+                                     position,
+                                     orientation,
+                                     zrange)
+    if set_alpha:
+      self.set_alpha_transparency(1)
+    return rendered[0]
+
+  def get_reward(self):
+    """Return the step reward; currently a constant 0 placeholder."""
+    # TODO: check did the robot follow text instructions?
+    return 0
+
+  def get_observation(self):
+    """Render the scene and package it as a top-down observation.
+
+    Returns:
+      dict with keys:
+        "image": colormap from get_heightmap().
+        "xyzmap": xyzmap from get_heightmap().
+        "pick": list copy of self.config["pick"].
+        "place": list copy of self.config["place"].
+    """
+    # Render current image.
+    color, depth, cam_position, cam_orientation, intrinsics = self.render_image()
+
+    # Back-project depth into camera-frame points.
+    camera_points = self.get_pointcloud(depth, intrinsics)
+
+    # Build the 4x4 camera pose from its rotation matrix and position, then
+    # apply it to the pointcloud.
+    translation = np.float32(cam_position).reshape(3, 1)
+    rotation = np.float32(pybullet.getMatrixFromQuaternion(cam_orientation)).reshape(3, 3)
+    camera_pose = np.eye(4)
+    camera_pose[:3, :] = np.hstack((rotation, translation))
+    world_points = self.transform_pointcloud(camera_points, camera_pose)
+
+    # Get heightmaps and colormaps; the heightmap itself is not returned.
+    _, colormap, xyzmap = self.get_heightmap(world_points, color, BOUNDS, PIXEL_SIZE)
+
+    return {
+        "image": colormap,
+        "xyzmap": xyzmap,
+        "pick": list(self.config["pick"]),
+        "place": list(self.config["place"]),
+    }
+
+  def render_image(self, image_size=(720, 720), intrinsics=(360., 0, 360., 0, 360., 360., 0, 0, 1)):
+    """Render a noisy RGB-D image from the fixed front camera.
+
+    Args:
+      image_size: (height, width) of the rendered image in pixels.
+      intrinsics: 9 values of the row-major 3x3 camera intrinsics matrix.
+
+    Returns:
+      color: HxWx3 uint8 image.
+      depth: HxW float32 depth image.
+      position: camera position tuple.
+      orientation: camera orientation as a quaternion.
+      intrinsics: 3x3 float32 intrinsics matrix.
+    """
+    # Camera parameters.
+    position = (0, -0.85, 0.4)
+    orientation = pybullet.getQuaternionFromEuler((np.pi / 4 + np.pi / 48, np.pi, np.pi))
+    return self._render_camera(image_size, intrinsics, position, orientation)
+
+  def render_image_top(self,
+                       image_size=(240, 240),
+                       intrinsics=(2000., 0, 2000., 0, 2000., 2000., 0, 0, 1),
+                       position=(0, -0.5, 5),
+                       orientation=(0, np.pi, -np.pi / 2),
+                       zrange=(0.01, 1.)):
+    """Render a noisy RGB-D image from a configurable camera pose.
+
+    Args:
+      image_size: (height, width) of the rendered image in pixels.
+      intrinsics: 9 values of the row-major 3x3 camera intrinsics matrix.
+      position: camera position.
+      orientation: camera orientation as Euler angles.
+      zrange: accepted for interface compatibility but NOT used; the clip
+        planes are fixed at (0.01, 10.). NOTE(review): with the default
+        camera 5m above the ground, honouring the default far plane of 1.
+        would presumably clip the whole scene - confirm before wiring it in.
+
+    Returns:
+      Same tuple as render_image(); orientation is returned as a quaternion.
+    """
+    orientation = pybullet.getQuaternionFromEuler(orientation)
+    return self._render_camera(image_size, intrinsics, position, orientation)
+
+  def _render_camera(self, image_size, intrinsics, position, orientation, noise=True):
+    """Shared renderer for render_image() and render_image_top().
+
+    Args:
+      image_size: (height, width) in pixels.
+      intrinsics: 9 values of the row-major 3x3 intrinsics matrix.
+      position: camera position.
+      orientation: camera orientation quaternion.
+      noise: when True, add Gaussian noise to colour (sigma 3) and depth
+        (sigma 0.003).
+
+    Returns:
+      (color, depth, position, orientation, intrinsics) as documented on
+      render_image().
+    """
+    znear, zfar = (0.01, 10.)
+
+    # OpenGL camera settings: rotate the canonical look (+z) and up (-y)
+    # directions by the camera orientation.
+    rotm = np.float32(pybullet.getMatrixFromQuaternion(orientation)).reshape(3, 3)
+    lookdir = (rotm @ np.float32([0, 0, 1]).reshape(3, 1)).reshape(-1)
+    updir = (rotm @ np.float32([0, -1, 0]).reshape(3, 1)).reshape(-1)
+    lookat = position + lookdir
+    focal_len = intrinsics[0]
+    viewm = pybullet.computeViewMatrix(position, lookat, updir)
+    fovh = (image_size[0] / 2) / focal_len
+    fovh = 180 * np.arctan(fovh) * 2 / np.pi
+
+    # Notes: 1) FOV is vertical FOV 2) aspect must be float
+    aspect_ratio = image_size[1] / image_size[0]
+    projm = pybullet.computeProjectionMatrixFOV(fovh, aspect_ratio, znear, zfar)
+
+    # Render with OpenGL camera settings.
+    # Use brighter lighting to prevent colors from appearing dark/washed out
+    _, _, color, depth, _ = pybullet.getCameraImage(
+        width=image_size[1],
+        height=image_size[0],
+        viewMatrix=viewm,
+        projectionMatrix=projm,
+        shadow=1,
+        lightDirection=[0.5, 0.5, 1],
+        lightColor=[1.0, 1.0, 1.0],
+        lightDistance=2.0,
+        flags=pybullet.ER_SEGMENTATION_MASK_OBJECT_AND_LINKINDEX,
+        renderer=pybullet.ER_BULLET_HARDWARE_OPENGL)
+
+    # Get color image.
+    color_image_size = (image_size[0], image_size[1], 4)
+    color = np.array(color, dtype=np.uint8).reshape(color_image_size)
+    color = color[:, :, :3]  # remove alpha channel
+    if noise:
+      # Add noise in int32 so values cannot wrap around before clipping.
+      color = np.int32(color)
+      color += np.int32(np.random.normal(0, 3, color.shape))
+      color = np.uint8(np.clip(color, 0, 255))
+
+    # Get depth image: convert the z-buffer to depth using the clip planes.
+    depth_image_size = (image_size[0], image_size[1])
+    zbuffer = np.float32(depth).reshape(depth_image_size)
+    depth = (zfar + znear - (2 * zbuffer - 1) * (zfar - znear))
+    depth = (2 * znear * zfar) / depth
+    if noise:
+      depth += np.random.normal(0, 0.003, depth.shape)
+
+    intrinsics = np.float32(intrinsics).reshape(3, 3)
+    return color, depth, position, orientation, intrinsics
+
+  def get_pointcloud(self, depth, intrinsics):
+    """Back-project a perspective depth image into a 3D pointcloud.
+
+    Args:
+      depth: HxW float array of perspective depth in meters.
+      intrinsics: 3x3 float array of camera intrinsics matrix.
+
+    Returns:
+      points: HxWx3 float32 array of 3D points in camera coordinates.
+    """
+    rows, cols = depth.shape
+    fx, fy = intrinsics[0, 0], intrinsics[1, 1]
+    cx, cy = intrinsics[0, 2], intrinsics[1, 2]
+
+    # Pixel coordinate grids: u runs along columns, v along rows.
+    u, v = np.meshgrid(np.linspace(0, cols - 1, cols),
+                       np.linspace(0, rows - 1, rows))
+
+    # Pinhole model: offset from the principal point scaled by depth / focal.
+    x = (u - cx) * (depth / fx)
+    y = (v - cy) * (depth / fy)
+    return np.stack([x, y, depth], axis=-1).astype(np.float32)
+
+  def transform_pointcloud(self, points, transform):
+    """Apply a rigid transformation to a 3D pointcloud, in place.
+
+    Args:
+      points: HxWx3 float array of 3D points in camera coordinates. This
+        array is overwritten with the result.
+      transform: 4x4 float array representing a rigid transformation matrix.
+
+    Returns:
+      points: the same HxWx3 array object, now holding transformed points.
+    """
+    # Append a homogeneous coordinate of 1 to a copy of the points.
+    homogen_points = np.pad(points.copy(), ((0, 0), (0, 0), (0, 1)),
+                            "constant", constant_values=1)
+
+    # Each output coordinate is the dot product of one transform row with the
+    # homogeneous point.
+    for axis, row in enumerate(transform[:3]):
+      points[..., axis] = (row * homogen_points).sum(axis=-1)
+    return points
+
+  def get_heightmap(self, points, colors, bounds, pixel_size):
+    """Get top-down (z-axis) orthographic heightmap image from 3D pointcloud.
+
+    Args:
+      points: HxWx3 float array of 3D points in world coordinates.
+      colors: HxWx3 uint8 array of values in range 0-255 aligned with points.
+      bounds: 3x2 float array of values (rows: X,Y,Z; columns: min,max) defining
+        region in 3D space to generate heightmap in world coordinates.
+      pixel_size: float defining size of each pixel in meters.
+
+    Returns:
+      heightmap: HxW float array of height (from lower z-bound) in meters.
+      colormap: HxWx3 uint8 array of backprojected color aligned with heightmap.
+      xyzmap: HxWx3 float array of XYZ points in world coordinates. X and Y
+        are a regular grid spanning `bounds`; Z comes from the pointcloud.
+    """
+    width = int(np.round((bounds[0, 1] - bounds[0, 0]) / pixel_size))
+    height = int(np.round((bounds[1, 1] - bounds[1, 0]) / pixel_size))
+    heightmap = np.zeros((height, width), dtype=np.float32)
+    colormap = np.zeros((height, width, colors.shape[-1]), dtype=np.uint8)
+    xyzmap = np.zeros((height, width, 3), dtype=np.float32)
+
+    # Filter out 3D points that are outside of the predefined bounds.
+    ix = (points[Ellipsis, 0] >= bounds[0, 0]) & (points[Ellipsis, 0] < bounds[0, 1])
+    iy = (points[Ellipsis, 1] >= bounds[1, 0]) & (points[Ellipsis, 1] < bounds[1, 1])
+    iz = (points[Ellipsis, 2] >= bounds[2, 0]) & (points[Ellipsis, 2] < bounds[2, 1])
+    valid = ix & iy & iz
+    points = points[valid]
+    colors = colors[valid]
+
+    # Sort 3D points by z-value, which works with array assignment to simulate
+    # z-buffering for rendering the heightmap image.
+    iz = np.argsort(points[:, -1])
+    points, colors = points[iz], colors[iz]
+    px = np.int32(np.floor((points[:, 0] - bounds[0, 0]) / pixel_size))
+    py = np.int32(np.floor((points[:, 1] - bounds[1, 0]) / pixel_size))
+    px = np.clip(px, 0, width - 1)
+    py = np.clip(py, 0, height - 1)
+    heightmap[py, px] = points[:, 2] - bounds[2, 0]
+    for c in range(colors.shape[-1]):
+      colormap[py, px, c] = colors[:, c]
+      xyzmap[py, px, c] = points[:, c]
+    colormap = colormap[::-1, :, :]  # Flip up-down.
+
+    # Overwrite X/Y with a regular grid over the requested bounds. meshgrid
+    # returns (len(y), len(x)) arrays, so X is sampled `width` times and Y
+    # `height` times to match the (height, width) maps.
+    xv, yv = np.meshgrid(np.linspace(bounds[0, 0], bounds[0, 1], width),
+                         np.linspace(bounds[1, 0], bounds[1, 1], height))
+    xyzmap[:, :, 0] = xv
+    xyzmap[:, :, 1] = yv
+    xyzmap = xyzmap[::-1, :, :]  # Flip up-down.
+    heightmap = heightmap[::-1, :]  # Flip up-down.
+    return heightmap, colormap, xyzmap
\ No newline at end of file
diff --git a/saycan/policy.py b/saycan/policy.py
new file mode 100644
index 0000000..8b89a47
--- /dev/null
+++ b/saycan/policy.py
@@ -0,0 +1,61 @@
+"""
+SayCan Policy for SHARPIE.
+
+A simple policy wrapper for the SayCan environment. The actual LLM planning
+and CLIPort execution are handled by the environment module.
+
+Original SayCan Repository:
+    https://github.com/google-research/google-research/tree/master/saycan
+
+Reference:
+    Ahn, M., et al. (2022). Do As I Can, Not As I Say: Grounding Language in
+    Robotic Affordances. arXiv preprint arXiv:2204.01691.
+"""
+
+
+class Policy:
+    """
+    Pass-through policy for SayCan pick-and-place.
+
+    The participant's text input is returned unchanged as the action; the
+    environment is responsible for LLM planning and CLIPort execution.
+    """
+
+    def __init__(self, room_name=""):
+        """
+        Create the policy.
+
+        Args:
+            room_name: Optional room identifier (unused, kept for compatibility)
+        """
+        self.room_name = room_name
+        self.name = "SayCan_Policy"
+
+    def predict(self, observation, participant_input=None):
+        """
+        Return the action for the current step.
+
+        Args:
+            observation: Current observation from the environment (ignored)
+            participant_input: Text instruction from participant:
+                              - "task:<description>" to set task and auto-plan
+                              - "plan" to get next planned action
+                              - Direct text instruction for CLIPort
+
+        Returns:
+            participant_input, unchanged (None when no input was given)
+        """
+        return participant_input
+
+    def update(self, observation, action, reward, done, next_observation):
+        """
+        Accept a transition and do nothing.
+
+        SayCan doesn't use traditional RL updates; the method exists only so
+        the policy matches the interface the SHARPIE framework calls.
+        """
+        return None
+
+
+# Module-level instance for use by the runner.
+policy = Policy(room_name="saycan")
\ No newline at end of file
diff --git a/saycan/requirements.txt b/saycan/requirements.txt
new file mode 100644
index 0000000..a822930
--- /dev/null
+++ b/saycan/requirements.txt
@@ -0,0 +1,41 @@
+# Text processing utilities
+ftfy
+regex
+tqdm
+fvcore
+
+# OpenAI CLIP (install from git)
+git+https://github.com/openai/CLIP.git
+
+# Google Drive downloader
+gdown
+
+# Video and image processing
+moviepy
+imageio
+imageio-ffmpeg
+opencv-python
+pillow
+
+# Plotting and display
+matplotlib
+ipython
+
+# Robotics simulation (pybullet), LLM client (ollama), and attribute-style dicts (easydict)
+pybullet
+ollama
+easydict
+
+# Deep learning frameworks
+tensorflow
+torch
+torchvision
+
+# JAX with CUDA support
+jax[cuda]
+flax
+optax
+
+# Numerical computing
+numpy
+scipy
\ No newline at end of file
diff --git a/saycan/robot.py b/saycan/robot.py
new file mode 100644
index 0000000..0eb827d
--- /dev/null
+++ b/saycan/robot.py
@@ -0,0 +1,161 @@
+"""
+Robotiq 2F-85 Gripper Control Module.
+
+This module provides control for the Robotiq 2F-85 parallel gripper in PyBullet
+simulation. The gripper is commonly used with UR5e robot arm in manipulation tasks.
+
+Key Features:
+- Gripper open/close control
+- Grasp detection
+- Thread-based constraint enforcement
+
+Original SayCan Repository:
+    https://github.com/google-research/google-research/tree/master/saycan
+
+Reference:
+    Ahn, M., et al. (2022). Do As I Can, Not As I Say: Grounding Language in
+    Robotic Affordances. arXiv preprint arXiv:2204.01691.
+"""
+
+import os
+import threading
+import time
+import numpy as np
+import pybullet
+
+# Absolute path of the directory containing this module; the gripper URDF is
+# resolved relative to it.
+SAYCAN_DIR = os.path.dirname(os.path.abspath(__file__))
+
+
+class Robotiq2F85:
+    """
+    Gripper handling for Robotiq 2F-85.
+
+    This class manages the gripper attached to a robot arm, providing
+    open/close functionality and grasp detection. A daemon thread keeps the
+    passive finger joints coupled to the motor joint.
+
+    Attributes:
+        robot: PyBullet robot body ID
+        tool: Link ID of the robot's tool (end effector)
+        body: PyBullet gripper body ID
+        n_joints: Number of joints in the gripper
+        activated: Whether the gripper is currently activated (grasping)
+        motor_joint: Index of the driven gripper joint
+        constraints_thread: Thread running step()
+    """
+
+    def __init__(self, robot, tool):
+        """
+        Load the gripper, attach it to the robot and start the coupling thread.
+
+        Args:
+            robot: PyBullet body ID of the robot
+            tool: Link ID of the robot's end effector
+        """
+        self.robot = robot
+        self.tool = tool
+
+        # Spawn pose of the gripper before the fixed constraint takes over.
+        pos = [0.1339999999999999, -0.49199999999872496, 0.5]
+        rot = pybullet.getQuaternionFromEuler([np.pi, 0, np.pi])
+        urdf = os.path.join(SAYCAN_DIR, "robotiq_2f_85", "robotiq_2f_85.urdf")
+        self.body = pybullet.loadURDF(urdf, pos, rot)
+        self.n_joints = pybullet.getNumJoints(self.body)
+        self.activated = False
+
+        # Connect gripper base to robot tool
+        pybullet.createConstraint(
+            self.robot, tool, self.body, 0,
+            jointType=pybullet.JOINT_FIXED,
+            jointAxis=[0, 0, 0],
+            parentFramePosition=[0, 0, 0],
+            childFramePosition=[0, 0, -0.07],
+            childFrameOrientation=pybullet.getQuaternionFromEuler([0, 0, np.pi / 2])
+        )
+
+        # Set friction coefficients for gripper fingers
+        for i in range(self.n_joints):
+            pybullet.changeDynamics(
+                self.body, i,
+                lateralFriction=10.0,
+                spinningFriction=1.0,
+                rollingFriction=1.0,
+                frictionAnchor=True
+            )
+
+        # Start thread to handle additional gripper constraints
+        self.motor_joint = 1
+        self.constraints_thread = threading.Thread(target=self.step)
+        self.constraints_thread.daemon = True
+        self.constraints_thread.start()
+
+    def step(self):
+        """Control joint positions by enforcing hard constraints on gripper behavior.
+
+        Runs until a PyBullet call fails, roughly once per millisecond. Each
+        pass drives joints 6, 3, 8, 5 and 10 to plus or minus the current
+        position of joint 1.
+        """
+        indj = [6, 3, 8, 5, 10]
+        while True:
+            try:
+                currj = [pybullet.getJointState(self.body, i)[0] for i in range(self.n_joints)]
+                targj = [currj[1], -currj[1], -currj[1], currj[1], currj[1]]
+                pybullet.setJointMotorControlArray(
+                    self.body, indj, pybullet.POSITION_CONTROL, targj,
+                    positionGains=np.ones(5)
+                )
+            except Exception:
+                # Any failure ends the thread (presumably the body was removed
+                # by a simulation reset - confirm). Catching Exception, not a
+                # bare except, lets SystemExit/KeyboardInterrupt propagate.
+                return
+            time.sleep(0.001)
+
+    def activate(self):
+        """Activate the gripper (close fingers to grasp)."""
+        pybullet.setJointMotorControl2(
+            self.body, self.motor_joint,
+            pybullet.VELOCITY_CONTROL,
+            targetVelocity=1,
+            force=10
+        )
+        self.activated = True
+
+    def release(self):
+        """Release the gripper (open fingers)."""
+        pybullet.setJointMotorControl2(
+            self.body, self.motor_joint,
+            pybullet.VELOCITY_CONTROL,
+            targetVelocity=-1,
+            force=10
+        )
+        self.activated = False
+
+    def detect_contact(self):
+        """Return whether the grasped object (or the gripper) touches something else.
+
+        Returns:
+            False when the gripper is not activated, or when the proximity ray
+            hits the gripper itself or body 0 (presumably the ground plane -
+            confirm). Otherwise the result of external_contact() for the
+            gripper body if the fingers are closed on nothing, else for the
+            body hit by the ray.
+        """
+        if not self.activated:
+            return False
+        obj, _, _ = self.check_proximity()
+        if obj == self.body or obj == 0:
+            return False
+        empty = self.grasp_width() < 0.01
+        cbody = self.body if empty else obj
+        return self.external_contact(cbody)
+
+    def external_contact(self, body=None):
+        """Return if body is in contact with something other than gripper.
+
+        Args:
+            body: PyBullet body ID to test; defaults to the gripper body.
+        """
+        if body is None:
+            body = self.body
+        pts = pybullet.getContactPoints(bodyA=body)
+        # Contact point field 2 is the other body's ID.
+        return any(pt[2] != self.body for pt in pts)
+
+    def check_grasp(self):
+        """Wait for the gripper to stop, then report if something is held.
+
+        Returns:
+            True when the finger pads are more than 0.01 apart.
+        """
+        # NOTE(review): self.moving() is not defined anywhere in this class,
+        # so this method raises AttributeError as written. Its intended
+        # semantics are not visible here; left unchanged on purpose.
+        while self.moving():
+            time.sleep(0.001)
+        success = self.grasp_width() > 0.01
+        return success
+
+    def grasp_width(self):
+        """Return the distance between the pad links (4 and 9) minus 0.047813."""
+        lpad = np.array(pybullet.getLinkState(self.body, 4)[0])
+        rpad = np.array(pybullet.getLinkState(self.body, 9)[0])
+        dist = np.linalg.norm(lpad - rpad) - 0.047813
+        return dist
+
+    def check_proximity(self):
+        """Cast a unit-length ray from the tool link through the gripper base.
+
+        Returns:
+            (obj, link, ray_frac): the first three fields of the ray hit -
+            body ID, link index and hit fraction along the ray.
+        """
+        ee_pos = np.array(pybullet.getLinkState(self.robot, self.tool)[0])
+        tool_pos = np.array(pybullet.getLinkState(self.body, 0)[0])
+        vec = (tool_pos - ee_pos) / np.linalg.norm((tool_pos - ee_pos))
+        ee_targ = ee_pos + vec
+        ray_data = pybullet.rayTest(ee_pos, ee_targ)[0]
+        obj, link, ray_frac = ray_data[0], ray_data[1], ray_data[2]
+        return obj, link, ray_frac
\ No newline at end of file
diff --git a/saycan/vild.py b/saycan/vild.py
new file mode 100644
index 0000000..d08b3f6
--- /dev/null
+++ b/saycan/vild.py
@@ -0,0 +1,624 @@
+"""
+ViLD - Vision and Language Knowledge Distillation for Open-Vocabulary Object Detection.
+
+This module provides the ViLD (Vision-Language Detection) model for open-vocabulary
+object detection. ViLD enables detecting objects beyond a fixed set of categories
+by leveraging CLIP embeddings.
+
+Key Components:
+- Text embedding building with prompt engineering
+- Object detection with confidence scoring
+- Visualization of detection results
+
+Original ViLD Repository:
+    https://github.com/tensorflow/tpu/tree/master/models/official/detection/projects/vild
+
+Reference:
+    Gu, X., Lin, T., Kuo, C., & Cui, Y. (2021). Open-Vocabulary Object Detection
+    via Vision and Language Knowledge Distillation. arXiv preprint arXiv:2104.13921.
+
+Used in SayCan:
+    https://github.com/google-research/google-research/tree/master/saycan
+"""
+
+import os
+import collections
+import numpy as np
+import cv2
+import torch
+import clip
+import matplotlib.pyplot as plt
+from tqdm import tqdm
+from easydict import EasyDict
+from PIL import Image
+import tensorflow.compat.v1 as tf
+
+# Get the directory where this script is located
+SAYCAN_DIR = os.path.dirname(os.path.abspath(__file__))
+
+
+def softmax(x, axis=-1):
+    """Compute the softmax of x along `axis`, subtracting the per-axis max first for numerical stability."""
+    e_x = np.exp(x - np.max(x, axis=axis, keepdims=True))
+    return e_x / np.sum(e_x, axis=axis, keepdims=True)
+
+
+# ViLD configuration flags
+FLAGS = {
+    'prompt_engineering': True,
+    'this_is': True,
+    'temperature': 100.0,
+    'use_softmax': False,
+}
+FLAGS = EasyDict(FLAGS)
+
+# Visualization parameters
+display_input_size = (10, 10)
+overall_fig_size = (18, 24)
+line_thickness = 1
+fig_size_w = 35
+mask_color = 'red'
+alpha = 0.5
+
+
+def article(name):
+    """Return 'an' if name starts with a lowercase vowel (a, e, i, o, u), 'a' otherwise."""
+    return "an" if name[0] in "aeiou" else "a"
+
+
+def processed_name(name, rm_dot=False):
+    """Normalize a category name: turn '_' into ' ' and '/' into ' or ', lowercase it, and optionally strip trailing dots."""
+    res = name.replace("_", " ").replace("/", " or ").lower()
+    if rm_dot:
+        res = res.rstrip(".")
+    return res
+
+
+# Prompt templates for CLIP embedding
+single_template = ["a photo of {article} {}."]
+
+multiple_templates = [
+    'There is {article} {} in the scene.',
+    'There is the {} in the scene.',
+    'a photo of {article} {} in the scene.',
+    'a photo of the {} in the scene.',
+    'a photo of one {} in the scene.',
+    'itap of {article} {}.',
+    'itap of my {}.',
+    'itap of the {}.',
+    'a photo of {article} {}.',
+    'a photo of my {}.',
+    'a photo of the {}.',
+    'a photo of one {}.',
+    'a photo of many {}.',
+    'a good photo of {article} {}.',
+    'a good photo of the {}.',
+    'a bad photo of {article} {}.',
+    'a bad photo of the {}.',
+    'a photo of a nice {}.',
+    'a photo of the nice {}.',
+    'a photo of a cool {}.',
+    'a photo of the cool {}.',
+    'a photo of a weird {}.',
+    'a photo of the weird {}.',
+    'a photo of a small {}.',
+    'a photo of the small {}.',
+    'a photo of a large {}.',
+    'a photo of the large {}.',
+    'a photo of a clean {}.',
+    'a photo of the clean {}.',
+    'a photo of a dirty {}.',
+    'a photo of the dirty {}.',
+    'a bright photo of {article} {}.',
+    'a bright photo of the {}.',
+    'a dark photo of {article} {}.',
+    'a dark photo of the {}.',
+    'a photo of a hard to see {}.',
+    'a photo of the hard to see {}.',
+    'a low resolution photo of {article} {}.',
+    'a low resolution photo of the {}.',
+    'a cropped photo of {article} {}.',
+    'a cropped photo of the {}.',
+    'a close-up photo of {article} {}.',
+    'a close-up photo of the {}.',
+    'a jpeg corrupted photo of {article} {}.',
+    'a jpeg corrupted photo of the {}.',
+    'a blurry photo of {article} {}.',
+    'a blurry photo of the {}.',
+    'a pixelated photo of {article} {}.',
+    'a pixelated photo of the {}.',
+    'a black and white photo of the {}.',
+    'a black and white photo of {article} {}.',
+    'a plastic {}.',
+    'the plastic {}.',
+    'a toy {}.',
+    'the toy {}.',
+    'a plushie {}.',
+    'the plushie {}.',
+    'a cartoon {}.',
+    'the cartoon {}.',
+    'an embroidered {}.',
+    'the embroidered {}.',
+    'a painting of the {}.',
+    'a painting of a {}.',
+]
+
+# Load CLIP model
+clip_model, clip_preprocess = clip.load("ViT-B/32")
+if torch.cuda.is_available():
+    clip_model.cuda()
+clip_model.eval()
+print("Model parameters:", f"{np.sum([int(np.prod(p.shape)) for p in clip_model.parameters()]):,}")
+print("Input resolution:", clip_model.visual.input_resolution)
+print("Context length:", clip_model.context_length)
+print("Vocab size:", clip_model.vocab_size)
+
+
+def build_text_embedding(categories):
+    """
+    Build text embeddings for object categories using CLIP.
+
+    Args:
+        categories: List of category dicts with 'name' and 'id' keys
+
+    Returns:
+        Numpy array of L2-normalized text embeddings, one row per category
+    """
+    if FLAGS.prompt_engineering:
+        templates = multiple_templates
+    else:
+        templates = single_template
+
+    run_on_gpu = torch.cuda.is_available()
+
+    with torch.no_grad():
+        all_text_embeddings = []
+        print("Building text embeddings...")
+        for category in tqdm(categories):
+            texts = [
+                template.format(processed_name(category["name"], rm_dot=True),
+                                article=article(category["name"]))
+                for template in templates
+            ]
+            if FLAGS.this_is:
+                texts = [
+                    "This is " + text if text.startswith("a") or text.startswith("the") else text
+                    for text in texts
+                ]
+            texts = clip.tokenize(texts)
+            if run_on_gpu:
+                texts = texts.cuda()
+            text_embeddings = clip_model.encode_text(texts)
+            text_embeddings /= text_embeddings.norm(dim=-1, keepdim=True)
+            text_embedding = text_embeddings.mean(dim=0)
+            text_embedding /= text_embedding.norm()
+            all_text_embeddings.append(text_embedding)
+        all_text_embeddings = torch.stack(all_text_embeddings, dim=1)
+        if run_on_gpu:
+            all_text_embeddings = all_text_embeddings.cuda()
+    return all_text_embeddings.cpu().numpy().T
+
+
+# Load ViLD TensorFlow model
+config = tf.ConfigProto(allow_soft_placement=True)
+if torch.cuda.is_available():
+    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.2)
+    config = tf.ConfigProto(gpu_options=gpu_options, allow_soft_placement=True)
+session = tf.Session(graph=tf.Graph(), config=config)
+saved_model_dir = os.path.join(SAYCAN_DIR, "image_path_v2")
+_ = tf.saved_model.loader.load(session, ["serve"], saved_model_dir)
+
+numbered_categories = [{"name": str(idx), "id": idx} for idx in range(50)]
+numbered_category_indices = {cat["id"]: cat for cat in numbered_categories}
+
+
+def nms(dets, scores, thresh, max_dets=1000):
+    """
+    Non-maximum suppression.
+
+    Args:
+        dets: Detection boxes [N, 4]
+        scores: Detection scores [N,]
+        thresh: IoU threshold
+        max_dets: Maximum detections to keep
+
+    Returns:
+        List of indices to keep
+    """
+    y1 = dets[:, 0]
+    x1 = dets[:, 1]
+    y2 = dets[:, 2]
+    x2 = dets[:, 3]
+
+    areas = (x2 - x1) * (y2 - y1)
+    order = scores.argsort()[::-1]
+
+    keep = []
+    while order.size > 0 and len(keep) < max_dets:
+        i = order[0]
+        keep.append(i)
+
+        xx1 = np.maximum(x1[i], x1[order[1:]])
+        yy1 = np.maximum(y1[i], y1[order[1:]])
+        xx2 = np.minimum(x2[i], x2[order[1:]])
+        yy2 = np.minimum(y2[i], y2[order[1:]])
+
+        w = np.maximum(0.0, xx2 - xx1)
+        h = np.maximum(0.0, yy2 - yy1)
+        intersection = w * h
+        overlap = intersection / (areas[i] + areas[order[1:]] - intersection + 1e-12)
+
+        inds = np.where(overlap <= thresh)[0]
+        order = order[inds + 1]
+    return keep
+
+
+import PIL.ImageColor as ImageColor
+import PIL.ImageDraw as ImageDraw
+import PIL.ImageFont as ImageFont
+
+STANDARD_COLORS = ["White"]
+
+
+def draw_bounding_box_on_image(image, ymin, xmin, ymax, xmax, color="red", thickness=4,
+                                display_str_list=(), use_normalized_coordinates=True):
+    """Adds a bounding box to an image."""
+    draw = ImageDraw.Draw(image)
+    im_width, im_height = image.size
+    if use_normalized_coordinates:
+        (left, right, top, bottom) = (
+            xmin * im_width, xmax * im_width, ymin * im_height, ymax * im_height
+        )
+    else:
+        (left, right, top, bottom) = (xmin, xmax, ymin, ymax)
+    draw.line([(left, top), (left, bottom), (right, bottom), (right, top), (left, top)],
+              width=thickness, fill=color)
+    try:
+        font = ImageFont.truetype("arial.ttf", 24)
+    except IOError:
+        font = ImageFont.load_default()
+
+    display_str_heights = [font.getbbox(ds)[3] - font.getbbox(ds)[1] for ds in display_str_list]
+    total_display_str_height = (1 + 2 * 0.05) * sum(display_str_heights)
+
+    if top > total_display_str_height:
+        text_bottom = top
+    else:
+        text_bottom = bottom + total_display_str_height
+
+    for display_str in display_str_list[::-1]:
+        text_left = min(5, left)
+        bbox = font.getbbox(display_str)
+        text_width, text_height = bbox[2] - bbox[0], bbox[3] - bbox[1]
+        margin = np.ceil(0.05 * text_height)
+        draw.rectangle([(left, text_bottom - text_height - 2 * margin),
+                        (left + text_width, text_bottom)], fill=color)
+        draw.text((left + margin, text_bottom - text_height - margin), display_str,
+                  fill="black", font=font)
+        text_bottom -= text_height - 2 * margin
+
+
+def draw_bounding_box_on_image_array(image, ymin, xmin, ymax, xmax, color="red", thickness=4,
+                                      display_str_list=(), use_normalized_coordinates=True):
+    """Adds a bounding box to an image (numpy array)."""
+    image_pil = Image.fromarray(np.uint8(image)).convert("RGB")
+    draw_bounding_box_on_image(image_pil, ymin, xmin, ymax, xmax, color, thickness,
+                                display_str_list, use_normalized_coordinates)
+    np.copyto(image, np.array(image_pil))
+
+
+def draw_mask_on_image_array(image, mask, color="red", alpha=0.4):
+    """Draws mask on an image."""
+    if image.dtype != np.uint8:
+        raise ValueError("`image` not of type np.uint8")
+    if mask.dtype != np.uint8:
+        raise ValueError("`mask` not of type np.uint8")
+    if np.any(np.logical_and(mask != 1, mask != 0)):
+        raise ValueError("`mask` elements should be in [0, 1]")
+    if image.shape[:2] != mask.shape:
+        raise ValueError("Image and mask dimensions don't match")
+
+    rgb = ImageColor.getrgb(color)
+    pil_image = Image.fromarray(image)
+    solid_color = np.expand_dims(np.ones_like(mask), axis=2) * np.reshape(list(rgb), [1, 1, 3])
+    pil_solid_color = Image.fromarray(np.uint8(solid_color)).convert("RGBA")
+    pil_mask = Image.fromarray(np.uint8(255.0 * alpha * mask)).convert("L")
+    pil_image = Image.composite(pil_solid_color, pil_image, pil_mask)
+    np.copyto(image, np.array(pil_image.convert("RGB")))
+
+
+def visualize_boxes_and_labels_on_image_array(image, boxes, classes, scores, category_index,
+                                               instance_masks=None, instance_boundaries=None,
+                                               use_normalized_coordinates=False, max_boxes_to_draw=20,
+                                               min_score_thresh=0.5, agnostic_mode=False,
+                                               line_thickness=1, groundtruth_box_visualization_color="black",
+                                               skip_scores=False, skip_labels=False, mask_alpha=0.4,
+                                               plot_color=None):
+    """Overlay labeled boxes on an image with formatted scores and label names."""
+    box_to_display_str_map = collections.defaultdict(list)
+    box_to_color_map = collections.defaultdict(str)
+    box_to_instance_masks_map = {}
+    box_to_score_map = {}
+    box_to_instance_boundaries_map = {}
+
+    if not max_boxes_to_draw:
+        max_boxes_to_draw = boxes.shape[0]
+
+    for i in range(min(max_boxes_to_draw, boxes.shape[0])):
+        if scores is None or scores[i] > min_score_thresh:
+            box = tuple(boxes[i].tolist())
+            if instance_masks is not None:
+                box_to_instance_masks_map[box] = instance_masks[i]
+            if instance_boundaries is not None:
+                box_to_instance_boundaries_map[box] = instance_boundaries[i]
+            if scores is None:
+                box_to_color_map[box] = groundtruth_box_visualization_color
+            else:
+                display_str = ""
+                if not skip_labels:
+                    if not agnostic_mode:
+                        if classes[i] in list(category_index.keys()):
+                            class_name = category_index[classes[i]]["name"]
+                        else:
+                            class_name = "N/A"
+                        display_str = str(class_name)
+                if not skip_scores:
+                    if not display_str:
+                        display_str = "{}%".format(int(100 * scores[i]))
+                    else:
+                        float_score = ("%.2f" % scores[i]).lstrip("0")
+                        display_str = "{}: {}".format(display_str, float_score)
+                    box_to_score_map[box] = int(100 * scores[i])
+
+                box_to_display_str_map[box].append(display_str)
+                if plot_color is not None:
+                    box_to_color_map[box] = plot_color
+                elif agnostic_mode:
+                    box_to_color_map[box] = "DarkOrange"
+                else:
+                    box_to_color_map[box] = STANDARD_COLORS[classes[i] % len(STANDARD_COLORS)]
+
+    if box_to_score_map:
+        box_color_iter = sorted(box_to_color_map.items(), key=lambda kv: box_to_score_map[kv[0]])
+    else:
+        box_color_iter = box_to_color_map.items()
+
+    for box, color in box_color_iter:
+        ymin, xmin, ymax, xmax = box
+        if instance_masks is not None:
+            draw_mask_on_image_array(image, box_to_instance_masks_map[box], color=color, alpha=mask_alpha)
+        if instance_boundaries is not None:
+            draw_mask_on_image_array(image, box_to_instance_boundaries_map[box], color="red", alpha=1.0)
+        draw_bounding_box_on_image_array(image, ymin, xmin, ymax, xmax, color=color,
+                                          thickness=line_thickness,
+                                          display_str_list=box_to_display_str_map[box],
+                                          use_normalized_coordinates=use_normalized_coordinates)
+
+    return image
+
+
+def paste_instance_masks(masks, detected_boxes, image_height, image_width):
+    """Paste instance masks to generate the image segmentation results."""
+    def expand_boxes(boxes, scale):
+        w_half = boxes[:, 2] * 0.5
+        h_half = boxes[:, 3] * 0.5
+        x_c = boxes[:, 0] + w_half
+        y_c = boxes[:, 1] + h_half
+        w_half *= scale
+        h_half *= scale
+        boxes_exp = np.zeros(boxes.shape)
+        boxes_exp[:, 0] = x_c - w_half
+        boxes_exp[:, 2] = x_c + w_half
+        boxes_exp[:, 1] = y_c - h_half
+        boxes_exp[:, 3] = y_c + h_half
+        return boxes_exp
+
+    _, mask_height, mask_width = masks.shape
+    scale = max((mask_width + 2.0) / mask_width, (mask_height + 2.0) / mask_height)
+    ref_boxes = expand_boxes(detected_boxes, scale)
+    ref_boxes = ref_boxes.astype(np.int32)
+    padded_mask = np.zeros((mask_height + 2, mask_width + 2), dtype=np.float32)
+    segms = []
+
+    for mask_ind, mask in enumerate(masks):
+        im_mask = np.zeros((image_height, image_width), dtype=np.uint8)
+        padded_mask[1:-1, 1:-1] = mask[:, :]
+        ref_box = ref_boxes[mask_ind, :]
+        w = ref_box[2] - ref_box[0] + 1
+        h = ref_box[3] - ref_box[1] + 1
+        w = np.maximum(w, 1)
+        h = np.maximum(h, 1)
+        mask = cv2.resize(padded_mask, (w, h))
+        mask = np.array(mask > 0.5, dtype=np.uint8)
+        x_0 = min(max(ref_box[0], 0), image_width)
+        x_1 = min(max(ref_box[2] + 1, 0), image_width)
+        y_0 = min(max(ref_box[1], 0), image_height)
+        y_1 = min(max(ref_box[3] + 1, 0), image_height)
+        im_mask[y_0:y_1, x_0:x_1] = mask[(y_0 - ref_box[1]):(y_1 - ref_box[1]),
+                                            (x_0 - ref_box[0]):(x_1 - ref_box[0])]
+        segms.append(im_mask)
+
+    segms = np.array(segms)
+    return segms
+
+
+def plot_mask(color, alpha, original_image, mask):
+    """Plot instance mask on image."""
+    rgb = ImageColor.getrgb(color)
+    pil_image = Image.fromarray(original_image)
+    solid_color = np.expand_dims(np.ones_like(mask), axis=2) * np.reshape(list(rgb), [1, 1, 3])
+    pil_solid_color = Image.fromarray(np.uint8(solid_color)).convert("RGBA")
+    pil_mask = Image.fromarray(np.uint8(255.0 * alpha * mask)).convert("L")
+    pil_image = Image.composite(pil_solid_color, pil_image, pil_mask)
+    return np.array(pil_image.convert("RGB"))
+
+
+def display_image(path_or_array, size=(10, 10)):
+    """Display an image from path or array."""
+    if isinstance(path_or_array, str):
+        image = np.asarray(Image.open(open(path_or_array, "rb")).convert("RGB"))
+    else:
+        image = path_or_array
+    plt.figure(figsize=size)
+    plt.imshow(image)
+    plt.axis("off")
+    plt.show()
+
+
+def vild(image_path, category_name_string, params, plot_on=True, prompt_swaps=[]):
+    """
+    Run ViLD object detection on an image.
+
+    Args:
+        image_path: Path to the input image
+        category_name_string: Semicolon-separated category names
+        params: Tuple of (max_boxes, nms_thresh, min_rpn_score, min_box_area, max_box_area)
+        plot_on: Whether to display visualization
+        prompt_swaps: List of (old, new) string replacements for categories
+
+    Returns:
+        List of detected object names
+    """
+    # Preprocessing categories
+    for a, b in prompt_swaps:
+        category_name_string = category_name_string.replace(a, b)
+    category_names = [x.strip() for x in category_name_string.split(";")]
+    category_names = ["background"] + category_names
+    categories = [{"name": item, "id": idx + 1} for idx, item in enumerate(category_names)]
+    category_indices = {cat["id"]: cat for cat in categories}
+
+    max_boxes_to_draw, nms_threshold, min_rpn_score_thresh, min_box_area, max_box_area = params
+    fig_size_h = min(max(5, int(len(category_names) / 2.5)), 10)
+
+    # Run ViLD model
+    roi_boxes, roi_scores, detection_boxes, scores_unused, box_outputs, detection_masks, visual_features, image_info = session.run(
+        ["RoiBoxes:0", "RoiScores:0", "2ndStageBoxes:0", "2ndStageScoresUnused:0",
+         "BoxOutputs:0", "MaskOutputs:0", "VisualFeatOutputs:0", "ImageInfo:0"],
+        feed_dict={"Placeholder:0": [image_path]})
+
+    roi_boxes = np.squeeze(roi_boxes, axis=0)
+    roi_scores = np.squeeze(roi_scores, axis=0)
+    detection_boxes = np.squeeze(detection_boxes, axis=(0, 2))
+    scores_unused = np.squeeze(scores_unused, axis=0)
+    box_outputs = np.squeeze(box_outputs, axis=0)
+    detection_masks = np.squeeze(detection_masks, axis=0)
+    visual_features = np.squeeze(visual_features, axis=0)
+
+    image_info = np.squeeze(image_info, axis=0)
+    image_scale = np.tile(image_info[2:3, :], (1, 2))
+    image_height = int(image_info[0, 0])
+    image_width = int(image_info[0, 1])
+
+    rescaled_detection_boxes = detection_boxes / image_scale
+
+    # Read image
+    image = np.asarray(Image.open(open(image_path, "rb")).convert("RGB"))
+    assert image_height == image.shape[0]
+    assert image_width == image.shape[1]
+
+    # Filter boxes with NMS
+    nmsed_indices = nms(detection_boxes, roi_scores, thresh=nms_threshold)
+    box_sizes = (rescaled_detection_boxes[:, 2] - rescaled_detection_boxes[:, 0]) * \
+                (rescaled_detection_boxes[:, 3] - rescaled_detection_boxes[:, 1])
+
+    valid_indices = np.where(
+        np.logical_and(
+            np.isin(np.arange(len(roi_scores), dtype=int), nmsed_indices),
+            np.logical_and(
+                np.logical_not(np.all(roi_boxes == 0., axis=-1)),
+                np.logical_and(
+                    roi_scores >= min_rpn_score_thresh,
+                    np.logical_and(box_sizes > min_box_area, box_sizes < max_box_area)
+                )
+            )
+        )
+    )[0]
+
+    detection_roi_scores = roi_scores[valid_indices][:max_boxes_to_draw, ...]
+    detection_boxes = detection_boxes[valid_indices][:max_boxes_to_draw, ...]
+    detection_masks = detection_masks[valid_indices][:max_boxes_to_draw, ...]
+    detection_visual_feat = visual_features[valid_indices][:max_boxes_to_draw, ...]
+    rescaled_detection_boxes = rescaled_detection_boxes[valid_indices][:max_boxes_to_draw, ...]
+
+    # Compute text embeddings and scores
+    text_features = build_text_embedding(categories)
+    raw_scores = detection_visual_feat.dot(text_features.T)
+    if FLAGS.use_softmax:
+        scores_all = softmax(FLAGS.temperature * raw_scores, axis=-1)
+    else:
+        scores_all = raw_scores
+
+    indices = np.argsort(-np.max(scores_all, axis=1))
+    indices_fg = np.array([i for i in indices if np.argmax(scores_all[i]) != 0])
+
+    # Get found objects
+    found_objects = []
+    for a, b in prompt_swaps:
+        category_names = [name.replace(b, a) for name in category_names]
+
+    for anno_idx in indices[0:int(rescaled_detection_boxes.shape[0])]:
+        scores = scores_all[anno_idx]
+        if np.argmax(scores) == 0:
+            continue
+        found_object = category_names[np.argmax(scores)]
+        if found_object == "background":
+            continue
+        print("Found a", found_object, "with score:", np.max(scores))
+        found_objects.append(category_names[np.argmax(scores)])
+
+    if not plot_on:
+        return found_objects
+
+    # Visualization
+    ymin, xmin, ymax, xmax = np.split(rescaled_detection_boxes, 4, axis=-1)
+    processed_boxes = np.concatenate([xmin, ymin, xmax - xmin, ymax - ymin], axis=-1)
+    segmentations = paste_instance_masks(detection_masks, processed_boxes, image_height, image_width)
+
+    if len(indices_fg) == 0:
+        display_image(np.array(image), size=overall_fig_size)
+        print("ViLD does not detect anything belonging to the given category")
+    else:
+        image_with_detections = visualize_boxes_and_labels_on_image_array(
+            np.array(image),
+            rescaled_detection_boxes[indices_fg],
+            valid_indices[:max_boxes_to_draw][indices_fg],
+            detection_roi_scores[indices_fg],
+            numbered_category_indices,
+            instance_masks=segmentations[indices_fg],
+            use_normalized_coordinates=False,
+            max_boxes_to_draw=max_boxes_to_draw,
+            min_score_thresh=min_rpn_score_thresh,
+            skip_scores=False,
+            skip_labels=True)
+
+        plt.imshow(image_with_detections)
+        plt.title("ViLD detected objects and RPN scores.")
+        plt.show()
+
+    return found_objects
+
+
+# Default category names for pick-and-place tasks
+category_names = [
+    'blue block', 'red block', 'green block', 'orange block', 'yellow block',
+    'purple block', 'pink block', 'cyan block', 'brown block', 'gray block',
+    'blue bowl', 'red bowl', 'green bowl', 'orange bowl', 'yellow bowl',
+    'purple bowl', 'pink bowl', 'cyan bowl', 'brown bowl', 'gray bowl'
+]
+
+image_path = 'tmp.jpg'
+
+# ViLD settings
+category_name_string = ";".join(category_names)
+max_boxes_to_draw = 8
+prompt_swaps = [('block', 'cube')]
+nms_threshold = 0.4
+min_rpn_score_thresh = 0.4
+min_box_area = 10
+max_box_area = 3000
+vild_params = max_boxes_to_draw, nms_threshold, min_rpn_score_thresh, min_box_area, max_box_area
+
+
+if __name__ == "__main__":
+    found_objects = vild(image_path, category_name_string, vild_params, plot_on=True, prompt_swaps=prompt_swaps)
+    print("Found objects:", found_objects)
\ No newline at end of file

From 8d7b14c5a6ac6c77040c95fd579376f224d166d6 Mon Sep 17 00:00:00 2001
From: Goncalves Braz <l.b.goncalvesbraz@uu.nl>
Date: Mon, 23 Feb 2026 22:34:52 +0100
Subject: [PATCH 2/4] Auto-run tasks, add ViLD lazy loading, remove demo

Make environment and model behavior more robust and production-friendly:

- README: Update example defaults (wait_for_inputs -> False, inputs_type -> 'other').
- cliport.py: Remove interactive/demo run_cliport helper to avoid shipping debug UI code.
- config.py: Call download_assets() on import rather than only when executed as __main__.
- environment.py:
  - Add cv2 import and convert cached frames / camera images to RGB in render().
  - Replace ad-hoc 'task:'/plan handling with run_task(), which sets a task and automatically plans+executes steps up to a limit, returning a summary (steps, total_reward, termination_reason) and caching video frames for rendering.
  - Adjust step handling to return run_task results for 'task:' actions.
- vild.py:
  - Replace eager CLIP/TF loading with lazy getters (get_clip_model, get_tf_session) and a cleanup_models() helper to free resources.
  - Add VILD_LAZY_LOAD env var to opt into lazy loading; otherwise behavior remains backward-compatible.
  - Update build_text_embedding to obtain the CLIP model lazily, move embeddings to CPU promptly, and clear GPU cache after use to reduce memory spike.

Tested on an NVIDIA A16. Known issue: ViLD still crashes when it is run multiple times.
---
 saycan/README.md      |  4 +--
 saycan/cliport.py     | 73 --------------------------------------
 saycan/config.py      |  4 +--
 saycan/environment.py | 82 ++++++++++++++++++++++++++++++++-----------
 saycan/vild.py        | 80 ++++++++++++++++++++++++++++++++++-------
 5 files changed, 133 insertions(+), 110 deletions(-)

diff --git a/saycan/README.md b/saycan/README.md
index 3911187..740c593 100644
--- a/saycan/README.md
+++ b/saycan/README.md
@@ -43,7 +43,7 @@ python manage.py shell -c "from experiment.models import Experiment, Environment
     'environment': Environment.objects.get(name='SayCan'),
     'number_of_episodes': 1,
     'target_fps': 24.0,
-    'wait_for_inputs': True
+    'wait_for_inputs': False
 })"
 ```
 
@@ -65,7 +65,7 @@ python manage.py shell -c "from experiment.models import Agent, Policy; Agent.ob
     'participant': True,
     'keyboard_inputs': {},
     'multiple_keyboard_inputs': False,
-    'inputs_type': 'actions',
+    'inputs_type': 'other',
     'textual_inputs': True
 })"
 ```
diff --git a/saycan/cliport.py b/saycan/cliport.py
index c7bec74..6cfde29 100644
--- a/saycan/cliport.py
+++ b/saycan/cliport.py
@@ -312,79 +312,6 @@ class TrainState(train_state.TrainState):
 
 
 
-user_input = 'Pick the yellow block and place it on the blue bowl.'  #@param {type:"string"}
-
-# Show camera image before pick and place.
-
-def run_cliport(obs, text, env=None, clip_model=None, coords=None, optim=None, eval_step_fn=None):
-  before = env.get_camera_image()
-  prev_obs = obs['image'].copy()
-
-  # Tokenize text and get CLIP features.
-  text_tokens = clip.tokenize(text).cuda()
-  with torch.no_grad():
-    text_feats = clip_model.encode_text(text_tokens).float()
-  text_feats /= text_feats.norm(dim=-1, keepdim=True)
-  text_feats = np.float32(text_feats.cpu())
-
-  # Normalize image and add batch dimension.
-  img = obs['image'][None, ...] / 255
-  img = np.concatenate((img, coords[None, ...]), axis=3)
-
-  # Run Transporter Nets to get pick and place heatmaps.
-  batch = {'img': jnp.float32(img), 'text': jnp.float32(text_feats)}
-  pick_map, place_map = eval_step_fn(optim, batch)
-  pick_map, place_map = np.float32(pick_map), np.float32(place_map)
-
-  # Get pick position.
-  pick_max = np.argmax(np.float32(pick_map)).squeeze()
-  pick_yx = (pick_max // 224, pick_max % 224)
-  pick_yx = np.clip(pick_yx, 20, 204)
-  pick_xyz = obs['xyzmap'][pick_yx[0], pick_yx[1]]
-
-  # Get place position.
-  place_max = np.argmax(np.float32(place_map)).squeeze()
-  place_yx = (place_max // 224, place_max % 224)
-  place_yx = np.clip(place_yx, 20, 204)
-  place_xyz = obs['xyzmap'][place_yx[0], place_yx[1]]
-
-  # Step environment.
-  act = {'pick': pick_xyz, 'place': place_xyz}
-  obs, _, _, _ = env.step(act)
-
-  # Show pick and place action.
-  plt.title(text)
-  plt.imshow(prev_obs)
-  plt.arrow(pick_yx[1], pick_yx[0], place_yx[1]-pick_yx[1], place_yx[0]-pick_yx[0], color='w', head_starts_at_zero=False, head_width=7, length_includes_head=True)
-  plt.show()
-
-  # Show debug plots.
-  plt.subplot(1, 2, 1)
-  plt.title('Pick Heatmap')
-  plt.imshow(pick_map.reshape(224, 224))
-  plt.subplot(1, 2, 2)
-  plt.title('Place Heatmap')
-  plt.imshow(place_map.reshape(224, 224))
-  plt.show()
-
-  # Show video of environment rollout.
-  debug_clip = ImageSequenceClip(env.cache_video, fps=25)
-  display(debug_clip.ipython_display(autoplay=1, loop=1, center=False))
-  env.cache_video = []
-
-  # Show camera image after pick and place.
-  plt.subplot(1, 2, 1)
-  plt.title('Before')
-  plt.imshow(before)
-  plt.subplot(1, 2, 2)
-  plt.title('After')
-  after = env.get_camera_image()
-  plt.imshow(after)
-  plt.show()
-
-  # return pick_xyz, place_xyz, pick_map, place_map, pick_yx, place_yx
-  return obs
-
 
 # ============================================================================
 # CLIPort Interface Class for easy integration
diff --git a/saycan/config.py b/saycan/config.py
index 837eb5d..43e64b8 100644
--- a/saycan/config.py
+++ b/saycan/config.py
@@ -95,9 +95,7 @@ def download_assets():
     finally:
         os.chdir(original_dir)
 
-# Call download_assets() only when this script is run directly, not when imported
-if __name__ == "__main__":
-    download_assets()
+download_assets()
 
 # =============================================================================
 # Global Constants
diff --git a/saycan/environment.py b/saycan/environment.py
index 6e276ae..7814c5c 100644
--- a/saycan/environment.py
+++ b/saycan/environment.py
@@ -24,6 +24,7 @@
     Robotic Affordances. arXiv preprint arXiv:2204.01691.
 """
 
+import cv2
 import os
 import sys
 import tempfile
@@ -238,25 +239,11 @@ def step(self, action_dict):
         if action == 'done':
             return np.array([]), 0.0, True, False, {"info": "Task completed"}
         elif isinstance(action, str) and action.startswith('task:'):
-            # Set task and plan first action
+            # Execute complete task automatically
             task_text = action[5:].strip()
-            self.set_task(task_text)
-            action_text, task_done = self.plan_next_action()
-            if task_done or action_text == "done":
-                return np.array([]), 0.0, False, False, {"info": "Task completed"}
-            action = action_text
-        elif action == 'plan':
-            # Get next planned action
-            if self._current_task is None:
-                return np.array([]), 0.0, False, False, {"info": "No task set"}
-            # Re-detect objects since they may have moved
-            self._found_objects = None
-            action_text, task_done = self.plan_next_action()
-            if task_done or action_text == "done":
-                return np.array([]), 0.0, False, False, {"info": "Task completed"}
-            action = action_text
-
-        if action and action != "done":
+            results = self.run_task(task_text)
+            return np.array([]), results["total_reward"], False, False, results
+        elif action:
             # Direct text instruction
             obs, reward, _, info = self._step_with_text(action)
             # Get the frames buffer
@@ -298,13 +285,68 @@ def _step_with_text(self, text):
     def render(self):
         """Render the environment."""
         if len(self.cached_video_frames) > 0:
-            return self.cached_video_frames.pop(0)
-        return self.env.get_camera_image()
+            return cv2.cvtColor(self.cached_video_frames.pop(0), cv2.COLOR_BGR2RGB)
+        return cv2.cvtColor(self.env.get_camera_image(), cv2.COLOR_BGR2RGB)
 
     def get_observation(self):
         """Get current observation without stepping."""
         return self.env.get_observation()
 
+    def run_task(self, task_text, max_steps=5):
+        """
+        Execute a complete task from start to finish with automatic planning.
+
+        This method sets the task and automatically executes all planned actions
+        until completion, without requiring manual 'plan' calls between steps.
+
+        Args:
+            task_text: Natural language task description (e.g., "put all blocks in bowls")
+            max_steps: Maximum number of actions to execute (default: 50)
+
+        Returns:
+            results: Dictionary containing:
+                - task: The original task text
+                - completed: Whether the task completed successfully
+                - steps: List of executed steps with actions and rewards
+                - total_reward: Cumulative reward across all steps
+                - termination_reason: Why execution stopped
+        """
+        self.set_task(task_text)
+
+        results = {
+            "task": task_text,
+            "completed": False,
+            "steps": [],
+            "total_reward": 0.0,
+            "termination_reason": None
+        }
+
+        for step in range(max_steps):
+            # Plan next action
+            action_text, task_done = self.plan_next_action()
+
+            # Check for task completion signal from LLM
+            if task_done or action_text == "done":
+                results["completed"] = True
+                results["termination_reason"] = "task_done"
+                break
+
+            # Execute the planned action
+            obs, reward, _, info = self._step_with_text(action_text)
+            results["total_reward"] += reward
+
+            results["steps"].append({
+                "step": step,
+                "action": action_text,
+                "reward": reward,
+                "info": info
+            })
+
+        # Cache final video frames for rendering
+        self.cached_video_frames = self.env.cache_video
+
+        return results
+
 
 # Create the environment instance for SHARPIE runner
 environment = EnvironmentWrapper()
\ No newline at end of file
diff --git a/saycan/vild.py b/saycan/vild.py
index d08b3f6..4bc018c 100644
--- a/saycan/vild.py
+++ b/saycan/vild.py
@@ -143,15 +143,66 @@ def processed_name(name, rm_dot=False):
     'a painting of a {}.',
 ]
 
-# Load CLIP model
-clip_model, clip_preprocess = clip.load("ViT-B/32")
-if torch.cuda.is_available():
-    clip_model.cuda()
-clip_model.eval()
-print("Model parameters:", f"{np.sum([int(np.prod(p.shape)) for p in clip_model.parameters()]):,}")
-print("Input resolution:", clip_model.visual.input_resolution)
-print("Context length:", clip_model.context_length)
-print("Vocab size:", clip_model.vocab_size)
+# Lazy-loaded models (loaded on first use, can be freed)
+_clip_model = None
+_clip_preprocess = None
+_tf_session = None
+
+
+def get_clip_model():
+    """Get or lazily load the CLIP model."""
+    global _clip_model, _clip_preprocess
+    if _clip_model is None:
+        _clip_model, _clip_preprocess = clip.load("ViT-B/32")
+        if torch.cuda.is_available():
+            _clip_model.cuda()
+        _clip_model.eval()
+        print("Model parameters:", f"{np.sum([int(np.prod(p.shape)) for p in _clip_model.parameters()]):,}")
+        print("Input resolution:", _clip_model.visual.input_resolution)
+        print("Context length:", _clip_model.context_length)
+        print("Vocab size:", _clip_model.vocab_size)
+    return _clip_model, _clip_preprocess
+
+
+def get_tf_session():
+    """Get or lazily load the TensorFlow session."""
+    global _tf_session
+    if _tf_session is None:
+        config = tf.ConfigProto(allow_soft_placement=True)
+        if torch.cuda.is_available():
+            gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.2)
+            config = tf.ConfigProto(gpu_options=gpu_options, allow_soft_placement=True)
+        _tf_session = tf.Session(graph=tf.Graph(), config=config)
+        saved_model_dir = os.path.join(SAYCAN_DIR, "image_path_v2")
+        _ = tf.saved_model.loader.load(_tf_session, ["serve"], saved_model_dir)
+    return _tf_session
+
+
+def cleanup_models():
+    """Free model resources. Call this when done with ViLD."""
+    global _clip_model, _clip_preprocess, _tf_session
+
+    if _tf_session is not None:
+        _tf_session.close()
+        _tf_session = None
+
+    if _clip_model is not None:
+        del _clip_model
+        del _clip_preprocess
+        _clip_model = None
+        _clip_preprocess = None
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+
+
+# Backward compatibility - load on import (can be disabled by setting VILD_LAZY_LOAD=true)
+import os as _os
+if _os.environ.get('VILD_LAZY_LOAD', '').lower() != 'true':
+    clip_model, clip_preprocess = get_clip_model()
+    session = get_tf_session()
+else:
+    clip_model, clip_preprocess = None, None
+    session = None
 
 
 def build_text_embedding(categories):
@@ -164,6 +215,8 @@ def build_text_embedding(categories):
     Returns:
         Numpy array of text embeddings
     """
+    clip_model, _ = get_clip_model()
+
     if FLAGS.prompt_engineering:
         templates = multiple_templates
     else:
@@ -192,10 +245,13 @@ def build_text_embedding(categories):
             text_embeddings /= text_embeddings.norm(dim=-1, keepdim=True)
             text_embedding = text_embeddings.mean(dim=0)
             text_embedding /= text_embedding.norm()
-            all_text_embeddings.append(text_embedding)
+            all_text_embeddings.append(text_embedding.cpu())  # Move to CPU immediately
         all_text_embeddings = torch.stack(all_text_embeddings, dim=1)
-        if run_on_gpu:
-            all_text_embeddings = all_text_embeddings.cuda()
+
+    # Clear GPU cache after embedding
+    if run_on_gpu:
+        torch.cuda.empty_cache()
+
     return all_text_embeddings.cpu().numpy().T
 
 

From bd7056443f1151e71d0aee43b03063890dccabc9 Mon Sep 17 00:00:00 2001
From: Floris den Hengst <florisdenhengst@gmail.com>
Date: Wed, 25 Feb 2026 20:48:57 +0100
Subject: [PATCH 3/4] Proposal to factor out base env and SHARPIE env wrapper

---
 saycan/base_environment.py | 348 +++++++++++++++++++++++++++++++++++++
 saycan/environment.py      | 301 +-------------------------------
 2 files changed, 352 insertions(+), 297 deletions(-)
 create mode 100644 saycan/base_environment.py

diff --git a/saycan/base_environment.py b/saycan/base_environment.py
new file mode 100644
index 0000000..b972b94
--- /dev/null
+++ b/saycan/base_environment.py
@@ -0,0 +1,348 @@
+"""
+SayCan Environment Wrapper for SHARPIE.
+
+This module wraps the PickPlaceEnv from the SayCan codebase to work with the
+SHARPIE experiment framework. It integrates:
+- ViLD for open-vocabulary object detection
+- LLM (via Ollama) for task planning and action scoring
+- CLIPort for language-conditioned pick-and-place manipulation
+
+Action Types:
+- "task:<description>" - Set task, then plan and execute it automatically (see run_task)
+- "plan" - Not handled specially by step(); treated as a direct text instruction
+- "<text instruction>" - Direct CLIPort instruction
+- "done" - End episode
+
+Original SayCan Repository:
+    https://github.com/google-research/google-research/tree/master/saycan
+
+Reference:
+    Ahn, M., Brohan, A., Brown, N., Chebotar, Y., Cortes, Y., David, B.,
+    Finn, C., Fu, C., Gopalakrishnan, K., Hausman, K., Herzog, A., Ho, D.,
+    Hsu, J., Ibarz, J., Ichter, B., Irpan, A., Jang, E., Jang, R., Julian, R.,
+    ... & Zeng, A. (2022). Do As I Can, Not As I Say: Grounding Language in
+    Robotic Affordances. arXiv preprint arXiv:2204.01691.
+"""
+
+import cv2
+import os
+import sys
+import tempfile
+import numpy as np
+from PIL import Image
+
+# Add the saycan directory to path for imports
+SAYCAN_DIR = os.path.dirname(os.path.abspath(__file__))
+if SAYCAN_DIR not in sys.path:
+    sys.path.insert(0, SAYCAN_DIR)
+
+from pick_place_env import PickPlaceEnv
+from config import PICK_TARGETS, PLACE_TARGETS
+from cliport import get_cliport
+# Import LLM and helpers for planning
+from llm import make_options, gpt3_scoring, gpt3_context, termination_string
+from helpers import normalize_scores, step_to_nlp, affordance_scoring
+from vild import vild, category_name_string, vild_params
+
+
+class SayCanBaseEnvironment:
+    """Wrapper for the SayCan PickPlaceEnv with LLM planning and CLIPort integration."""
+
+    def __init__(self):
+        """Initialize the environment."""
+        self.env = PickPlaceEnv()
+        self.config = None
+        self._step_count = 0
+        self._max_steps = 100
+        self._cliport = None
+        self.cached_video_frames = []
+
+        # LLM planning state
+        self._current_task = None
+        self._max_tasks = 10
+        self._gpt3_prompt = None
+        self._options = None
+        self._found_objects = None
+        self._task_step_count = 0
+
+    def reset(self, config=None):
+        """
+        Reset the environment to an initial state.
+
+        Args:
+            config: Optional configuration dict with 'pick' and 'place' lists.
+                   If None, uses default objects.
+
+        Returns:
+            observation: Initial observation dict with 'image', 'xyzmap', 'pick', 'place'
+            info: Additional information dict
+        """
+        self._step_count = 0
+        self.cached_video_frames = []
+
+        # Reset LLM planning state
+        self._current_task = None
+        self._gpt3_prompt = None
+        self._options = None
+        self._found_objects = None
+        self._task_step_count = 0
+
+        if config is None:
+            config = {'pick':  ['yellow block', 'blue block', 'red block'],
+                      'place': ['blue bowl', 'red bowl']}
+
+        self.config = config
+        observation = self.env.reset(config)
+
+        info = {
+            "step": 0,
+            "config": config,
+            "pick_objects": config.get("pick", []),
+            "place_objects": config.get("place", [])
+        }
+
+        return observation, info
+
+    def set_task(self, task_text):
+        """
+        Set the current task from natural language.
+
+        Args:
+            task_text: Task instruction (e.g., "put all the blocks in different corners")
+        """
+        self._current_task = task_text
+        self._gpt3_prompt = gpt3_context + "\n# " + task_text + "\n"
+        self._task_step_count = 0
+        self._found_objects = None
+        self._options = None
+        print(f"Environment: Task set to '{task_text}'")
+
+    def detect_objects(self, observation=None):
+        """
+        Detect objects in the scene using ViLD.
+
+        Args:
+            observation: Observation dict with 'image'. If None, uses current observation.
+
+        Returns:
+            found_objects: List of detected object names
+        """
+        if observation is None:
+            observation = self.env.get_observation()
+
+        # Save image to temp file for ViLD
+        image = observation['image']
+        with tempfile.NamedTemporaryFile(suffix='.jpg', delete=False) as f:
+            temp_path = f.name
+            Image.fromarray(image).save(temp_path)
+
+        try:
+            # Run ViLD detection
+            prompt_swaps = [('block', 'cube')]
+            found_objects = vild(temp_path, category_name_string, vild_params,
+                                plot_on=False, prompt_swaps=prompt_swaps)
+            print(f"Environment: Detected objects: {found_objects}")
+        finally:
+            # Clean up temp file
+            os.unlink(temp_path)
+
+        return found_objects
+
+    def plan_next_action(self, observation=None):
+        """
+        Plan the next action using LLM + affordance scoring.
+
+        Args:
+            observation: Current observation. If None, uses current observation.
+
+        Returns:
+            action_text: Natural language action instruction
+            done: Whether the task is complete
+        """
+        if observation is None:
+            observation = self.env.get_observation()
+
+        # Detect objects if not already done
+        if self._found_objects is None:
+            self._found_objects = self.detect_objects(observation)
+
+        # Create options if not already done
+        if self._options is None:
+            self._options = make_options(PICK_TARGETS, PLACE_TARGETS,
+                                         termination_string=termination_string)
+
+        # Calculate affordance scores based on detected objects
+        affordance_scores = affordance_scoring(self._options, self._found_objects,
+                                               block_name="box", bowl_name="circle",
+                                               verbose=False)
+
+        # Get LLM scores
+        llm_scores, _ = gpt3_scoring(self._gpt3_prompt, self._options, verbose=True)
+
+        # Combine scores
+        combined_scores = {
+            option: np.exp(llm_scores[option]) * affordance_scores[option]
+            for option in self._options
+        }
+        combined_scores = normalize_scores(combined_scores)
+
+        # Select best action
+        selected_task = max(combined_scores, key=combined_scores.get)
+
+        # Check for termination
+        if selected_task == termination_string:
+            print("Environment: Task completed (termination signal)")
+            return "done", True
+
+        # Update prompt for next step
+        self._gpt3_prompt += selected_task + "\n"
+        self._task_step_count += 1
+
+        # Check max tasks limit
+        if self._task_step_count >= self._max_tasks:
+            print("Environment: Max steps reached")
+            return "done", True
+
+        # Convert to natural language
+        action_text = step_to_nlp(selected_task)
+        print(f"Environment: Step {self._task_step_count} - {action_text}")
+        return action_text, False
+
+    def step(self, action_dict):
+        """
+        Execute one step in the environment.
+
+        Args:
+            action_dict: Dictionary with agent id as keys and action as value.
+                        Action can be:
+                        - string text instruction directly
+                        - "task:<description>" to set a task and run it via run_task()
+                        - "plan" has no dedicated branch; it is treated as a text instruction
+                        - "done" to end the episode
+
+        Returns:
+            observation: New observation dict
+            reward: Reward for the action (float)
+            terminated: Whether the episode has ended (bool)
+            truncated: Whether the episode was truncated (bool)
+            info: Additional information (dict)
+        """
+        self._step_count += 1
+
+        if len(self.cached_video_frames) > 0:
+            return np.array([]), 0.0, False, False, {"info": "No action taken"}
+
+        # Extract action from dict (single-agent environment)
+        action = list(action_dict.values())[0] if isinstance(action_dict, dict) else action_dict
+
+        # Handle different action types
+        if action == 'done':
+            return np.array([]), 0.0, True, False, {"info": "Task completed"}
+        elif isinstance(action, str) and action.startswith('task:'):
+            # Execute complete task automatically
+            task_text = action[5:].strip()
+            results = self.run_task(task_text)
+            return np.array([]), results["total_reward"], False, False, results
+        elif action:
+            # Direct text instruction
+            obs, reward, _, info = self._step_with_text(action)
+            # Get the frames buffer
+            self.cached_video_frames = self.env.cache_video
+        else:
+            return np.array([]), 0.0, False, False, {"info": "No action taken"}
+
+        # Check termination conditions
+        terminated = False
+        truncated = self._step_count >= self._max_steps
+
+        info["step"] = self._step_count
+        info["max_steps"] = self._max_steps
+
+        return obs, reward, terminated, truncated, info
+
+    def _step_with_text(self, text):
+        """Execute a step using CLIPort with text instruction."""
+        if self._cliport is None:
+            self._cliport = get_cliport()
+
+        # Get current observation
+        obs = self.env.get_observation()
+
+        # Use CLIPort to predict action
+        action = self._cliport.predict(obs, text)
+
+        # Execute the predicted action
+        obs, reward, done, info = self.env.step({
+            'pick': action['pick'],
+            'place': action['place']
+        })
+
+        info['text_instruction'] = text
+        info['cliport_action'] = action
+
+        return obs, reward, done, info
+
+    def render(self):
+        """Render the environment."""
+        if len(self.cached_video_frames) > 0:
+            return cv2.cvtColor(self.cached_video_frames.pop(0), cv2.COLOR_BGR2RGB)
+        return cv2.cvtColor(self.env.get_camera_image(), cv2.COLOR_BGR2RGB)
+
+    def get_observation(self):
+        """Get current observation without stepping."""
+        return self.env.get_observation()
+
+    def run_task(self, task_text, max_steps=5):
+        """
+        Execute a complete task from start to finish with automatic planning.
+
+        This method sets the task and automatically executes all planned actions
+        until completion, without requiring manual 'plan' calls between steps.
+
+        Args:
+            task_text: Natural language task description (e.g., "put all blocks in bowls")
+            max_steps: Maximum number of actions to execute (default: 5)
+
+        Returns:
+            results: Dictionary containing:
+                - task: The original task text
+                - completed: Whether the task completed successfully
+                - steps: List of executed steps with actions and rewards
+                - total_reward: Cumulative reward across all steps
+                - termination_reason: Why execution stopped
+        """
+        self.set_task(task_text)
+
+        results = {
+            "task": task_text,
+            "completed": False,
+            "steps": [],
+            "total_reward": 0.0,
+            "termination_reason": None
+        }
+
+        for step in range(max_steps):
+            # Plan next action
+            action_text, task_done = self.plan_next_action()
+
+            # Check for task completion signal from LLM
+            if task_done or action_text == "done":
+                results["completed"] = True
+                results["termination_reason"] = "task_done"
+                break
+
+            # Execute the planned action
+            obs, reward, _, info = self._step_with_text(action_text)
+            results["total_reward"] += reward
+
+            results["steps"].append({
+                "step": step,
+                "action": action_text,
+                "reward": reward,
+                "info": info
+            })
+
+        # Cache final video frames for rendering
+        self.cached_video_frames = self.env.cache_video
+
+        return results
\ No newline at end of file
diff --git a/saycan/environment.py b/saycan/environment.py
index 7814c5c..f44f8ef 100644
--- a/saycan/environment.py
+++ b/saycan/environment.py
@@ -23,190 +23,14 @@
     ... & Zeng, A. (2022). Do As I Can, Not As I Say: Grounding Language in
     Robotic Affordances. arXiv preprint arXiv:2204.01691.
 """
+from base_environment import SayCanBaseEnvironment
 
-import cv2
-import os
-import sys
-import tempfile
-import numpy as np
-from PIL import Image
-
-# Add the saycan directory to path for imports
-SAYCAN_DIR = os.path.dirname(os.path.abspath(__file__))
-if SAYCAN_DIR not in sys.path:
-    sys.path.insert(0, SAYCAN_DIR)
-
-from pick_place_env import PickPlaceEnv
-from config import PICK_TARGETS, PLACE_TARGETS
-from cliport import get_cliport
-# Import LLM and helpers for planning
-from llm import make_options, gpt3_scoring, gpt3_context, termination_string
-from helpers import normalize_scores, step_to_nlp, affordance_scoring
-from vild import vild, category_name_string, vild_params
-
-
-class EnvironmentWrapper:
+class EnvironmentWrapper(SayCanBaseEnvironment):
     """Wrapper for the SayCan PickPlaceEnv with LLM planning and CLIPort integration."""
 
     def __init__(self):
         """Initialize the environment."""
-        self.env = PickPlaceEnv()
-        self.config = None
-        self._step_count = 0
-        self._max_steps = 100
-        self._cliport = None
-        self.cached_video_frames = []
-
-        # LLM planning state
-        self._current_task = None
-        self._max_tasks = 10
-        self._gpt3_prompt = None
-        self._options = None
-        self._found_objects = None
-        self._task_step_count = 0
-
-    def reset(self, config=None):
-        """
-        Reset the environment to an initial state.
-
-        Args:
-            config: Optional configuration dict with 'pick' and 'place' lists.
-                   If None, uses default objects.
-
-        Returns:
-            observation: Initial observation dict with 'image', 'xyzmap', 'pick', 'place'
-            info: Additional information dict
-        """
-        self._step_count = 0
-        self.cached_video_frames = []
-
-        # Reset LLM planning state
-        self._current_task = None
-        self._gpt3_prompt = None
-        self._options = None
-        self._found_objects = None
-        self._task_step_count = 0
-
-        if config is None:
-            config = {'pick':  ['yellow block', 'blue block', 'red block'],
-                      'place': ['blue bowl', 'red bowl']}
-
-        self.config = config
-        observation = self.env.reset(config)
-
-        info = {
-            "step": 0,
-            "config": config,
-            "pick_objects": config.get("pick", []),
-            "place_objects": config.get("place", [])
-        }
-
-        return observation, info
-
-    def set_task(self, task_text):
-        """
-        Set the current task from natural language.
-
-        Args:
-            task_text: Task instruction (e.g., "put all the blocks in different corners")
-        """
-        self._current_task = task_text
-        self._gpt3_prompt = gpt3_context + "\n# " + task_text + "\n"
-        self._task_step_count = 0
-        self._found_objects = None
-        self._options = None
-        print(f"Environment: Task set to '{task_text}'")
-
-    def detect_objects(self, observation=None):
-        """
-        Detect objects in the scene using ViLD.
-
-        Args:
-            observation: Observation dict with 'image'. If None, uses current observation.
-
-        Returns:
-            found_objects: List of detected object names
-        """
-        if observation is None:
-            observation = self.env.get_observation()
-
-        # Save image to temp file for ViLD
-        image = observation['image']
-        with tempfile.NamedTemporaryFile(suffix='.jpg', delete=False) as f:
-            temp_path = f.name
-            Image.fromarray(image).save(temp_path)
-
-        try:
-            # Run ViLD detection
-            prompt_swaps = [('block', 'cube')]
-            found_objects = vild(temp_path, category_name_string, vild_params,
-                                plot_on=False, prompt_swaps=prompt_swaps)
-            print(f"Environment: Detected objects: {found_objects}")
-        finally:
-            # Clean up temp file
-            os.unlink(temp_path)
-
-        return found_objects
-
-    def plan_next_action(self, observation=None):
-        """
-        Plan the next action using LLM + affordance scoring.
-
-        Args:
-            observation: Current observation. If None, uses current observation.
-
-        Returns:
-            action_text: Natural language action instruction
-            done: Whether the task is complete
-        """
-        if observation is None:
-            observation = self.env.get_observation()
-
-        # Detect objects if not already done
-        if self._found_objects is None:
-            self._found_objects = self.detect_objects(observation)
-
-        # Create options if not already done
-        if self._options is None:
-            self._options = make_options(PICK_TARGETS, PLACE_TARGETS,
-                                         termination_string=termination_string)
-
-        # Calculate affordance scores based on detected objects
-        affordance_scores = affordance_scoring(self._options, self._found_objects,
-                                               block_name="box", bowl_name="circle",
-                                               verbose=False)
-
-        # Get LLM scores
-        llm_scores, _ = gpt3_scoring(self._gpt3_prompt, self._options, verbose=True)
-
-        # Combine scores
-        combined_scores = {
-            option: np.exp(llm_scores[option]) * affordance_scores[option]
-            for option in self._options
-        }
-        combined_scores = normalize_scores(combined_scores)
-
-        # Select best action
-        selected_task = max(combined_scores, key=combined_scores.get)
-
-        # Check for termination
-        if selected_task == termination_string:
-            print("Environment: Task completed (termination signal)")
-            return "done", True
-
-        # Update prompt for next step
-        self._gpt3_prompt += selected_task + "\n"
-        self._task_step_count += 1
-
-        # Check max tasks limit
-        if self._task_step_count >= self._max_tasks:
-            print("Environment: Max steps reached")
-            return "done", True
-
-        # Convert to natural language
-        action_text = step_to_nlp(selected_task)
-        print(f"Environment: Step {self._task_step_count} - {action_text}")
-        return action_text, False
+        super().__init__()
 
     def step(self, action_dict):
         """
@@ -227,126 +51,9 @@ def step(self, action_dict):
             truncated: Whether the episode was truncated (bool)
             info: Additional information (dict)
         """
-        self._step_count += 1
-
-        if len(self.cached_video_frames) > 0:
-            return np.array([]), 0.0, False, False, {"info": "No action taken"}
-
         # Extract action from dict (single-agent environment)
         action = list(action_dict.values())[0] if isinstance(action_dict, dict) else action_dict
-
-        # Handle different action types
-        if action == 'done':
-            return np.array([]), 0.0, True, False, {"info": "Task completed"}
-        elif isinstance(action, str) and action.startswith('task:'):
-            # Execute complete task automatically
-            task_text = action[5:].strip()
-            results = self.run_task(task_text)
-            return np.array([]), results["total_reward"], False, False, results
-        elif action:
-            # Direct text instruction
-            obs, reward, _, info = self._step_with_text(action)
-            # Get the frames buffer
-            self.cached_video_frames = self.env.cache_video
-        else:
-            return np.array([]), 0.0, False, False, {"info": "No action taken"}
-
-        # Check termination conditions
-        terminated = False
-        truncated = self._step_count >= self._max_steps
-
-        info["step"] = self._step_count
-        info["max_steps"] = self._max_steps
-
-        return obs, reward, terminated, truncated, info
-
-    def _step_with_text(self, text):
-        """Execute a step using CLIPort with text instruction."""
-        if self._cliport is None:
-            self._cliport = get_cliport()
-
-        # Get current observation
-        obs = self.env.get_observation()
-
-        # Use CLIPort to predict action
-        action = self._cliport.predict(obs, text)
-
-        # Execute the predicted action
-        obs, reward, done, info = self.env.step({
-            'pick': action['pick'],
-            'place': action['place']
-        })
-
-        info['text_instruction'] = text
-        info['cliport_action'] = action
-
-        return obs, reward, done, info
-
-    def render(self):
-        """Render the environment."""
-        if len(self.cached_video_frames) > 0:
-            return cv2.cvtColor(self.cached_video_frames.pop(0), cv2.COLOR_BGR2RGB)
-        return cv2.cvtColor(self.env.get_camera_image(), cv2.COLOR_BGR2RGB)
-
-    def get_observation(self):
-        """Get current observation without stepping."""
-        return self.env.get_observation()
-
-    def run_task(self, task_text, max_steps=5):
-        """
-        Execute a complete task from start to finish with automatic planning.
-
-        This method sets the task and automatically executes all planned actions
-        until completion, without requiring manual 'plan' calls between steps.
-
-        Args:
-            task_text: Natural language task description (e.g., "put all blocks in bowls")
-            max_steps: Maximum number of actions to execute (default: 50)
-
-        Returns:
-            results: Dictionary containing:
-                - task: The original task text
-                - completed: Whether the task completed successfully
-                - steps: List of executed steps with actions and rewards
-                - total_reward: Cumulative reward across all steps
-                - termination_reason: Why execution stopped
-        """
-        self.set_task(task_text)
-
-        results = {
-            "task": task_text,
-            "completed": False,
-            "steps": [],
-            "total_reward": 0.0,
-            "termination_reason": None
-        }
-
-        for step in range(max_steps):
-            # Plan next action
-            action_text, task_done = self.plan_next_action()
-
-            # Check for task completion signal from LLM
-            if task_done or action_text == "done":
-                results["completed"] = True
-                results["termination_reason"] = "task_done"
-                break
-
-            # Execute the planned action
-            obs, reward, _, info = self._step_with_text(action_text)
-            results["total_reward"] += reward
-
-            results["steps"].append({
-                "step": step,
-                "action": action_text,
-                "reward": reward,
-                "info": info
-            })
-
-        # Cache final video frames for rendering
-        self.cached_video_frames = self.env.cache_video
-
-        return results
-
+        return super().step(action)
 
 # Create the environment instance for SHARPIE runner
 environment = EnvironmentWrapper()
\ No newline at end of file

From 16eca38dc52508b5f0fa5ad117bd72f39ffe8824 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Libio=20Gon=C3=A7alves=20Braz?=
 <86714505+libgoncalv@users.noreply.github.com>
Date: Thu, 26 Feb 2026 17:47:56 +0100
Subject: [PATCH 4/4] Update environment.py to include saycan directory

Add directory to path for imports in environment.py
---
 saycan/environment.py | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/saycan/environment.py b/saycan/environment.py
index f44f8ef..effd2c5 100644
--- a/saycan/environment.py
+++ b/saycan/environment.py
@@ -23,6 +23,14 @@
     ... & Zeng, A. (2022). Do As I Can, Not As I Say: Grounding Language in
     Robotic Affordances. arXiv preprint arXiv:2204.01691.
 """
+import os 
+import sys
+
+# Add the saycan directory to path for imports
+SAYCAN_DIR = os.path.dirname(os.path.abspath(__file__))
+if SAYCAN_DIR not in sys.path:
+    sys.path.insert(0, SAYCAN_DIR)
+    
 from base_environment import SayCanBaseEnvironment
 
 class EnvironmentWrapper(SayCanBaseEnvironment):
@@ -56,4 +64,4 @@ def step(self, action_dict):
         return super().step(action)
 
 # Create the environment instance for SHARPIE runner
-environment = EnvironmentWrapper()
\ No newline at end of file
+environment = EnvironmentWrapper()