From a38cb2d5aed419ac20f74506232aa4fd129519ae Mon Sep 17 00:00:00 2001 From: Carson Berry Date: Thu, 6 Nov 2025 10:09:50 -0800 Subject: [PATCH 01/20] feat: load processing_manifest.json from main folder if necessary --- src/see_spot/app.py | 39 ++++++++++++++++++++++++--------------- src/see_spot/s3_utils.py | 35 +++++++++++++++++++++++++++++++++++ 2 files changed, 59 insertions(+), 15 deletions(-) diff --git a/src/see_spot/app.py b/src/see_spot/app.py index ceed09e..492da75 100644 --- a/src/see_spot/app.py +++ b/src/see_spot/app.py @@ -19,7 +19,8 @@ from see_spot.s3_utils import ( find_unmixed_spots_file, find_related_files, load_ratios_from_s3, load_summary_stats_from_s3, - load_processing_manifest_from_s3, load_and_merge_spots_from_s3 + load_processing_manifest_from_s3, load_and_merge_spots_from_s3, + find_processing_manifest ) logging.basicConfig(level=logging.INFO) @@ -265,10 +266,14 @@ async def get_real_spots_data( processing_manifest = df_cache.get("processing_manifest") spot_channels_from_manifest = df_cache.get("spot_channels_from_manifest") if not processing_manifest or not spot_channels_from_manifest: - # Construct manifest path and load - manifest_key = f"{DATA_PREFIX}/derived/processing_manifest.json" - logger.info(f"Attempting to load processing manifest from: s3://{S3_BUCKET}/{manifest_key}") - processing_manifest = load_processing_manifest_from_s3(S3_BUCKET, manifest_key) + # Find manifest in either top level or derived folder + manifest_key = find_processing_manifest(S3_BUCKET, DATA_PREFIX) + if not manifest_key: + logger.error(f"Could not find processing_manifest.json for dataset {DATA_PREFIX}") + spot_channels_from_manifest = [] + else: + logger.info(f"Attempting to load processing manifest from: s3://{S3_BUCKET}/{manifest_key}") + processing_manifest = load_processing_manifest_from_s3(S3_BUCKET, manifest_key) if processing_manifest and "spot_channels" in processing_manifest: spot_channels_from_manifest = processing_manifest["spot_channels"] df_cache["processing_manifest"] = processing_manifest @@ -280,7 +285,11 @@ async def get_real_spots_data( else: # Need to load DataFrame from S3 # 1. Load processing manifest to determine paths and channels - manifest_key = f"{DATA_PREFIX}/derived/processing_manifest.json" + manifest_key = find_processing_manifest(S3_BUCKET, DATA_PREFIX) + if not manifest_key: + logger.error(f"Could not find processing_manifest.json for dataset {DATA_PREFIX}.") + return JSONResponse(status_code=500, content={'error': 'Failed to find processing manifest'}) + logger.info(f"Attempting to load processing manifest from: s3://{S3_BUCKET}/{manifest_key}") processing_manifest = load_processing_manifest_from_s3(S3_BUCKET, manifest_key) @@ -586,22 +595,22 @@ async def download_dataset(request: Request): return JSONResponse(status_code=400, content={"error": "Dataset name is required"}) # Check if dataset exists on S3 by looking for the processing manifest - manifest_key = f"{dataset_name}/derived/processing_manifest.json" + manifest_key = find_processing_manifest(S3_BUCKET, dataset_name) - logger.info(f"Checking if dataset exists: s3://{S3_BUCKET}/{manifest_key}") - - # Try to get the manifest to verify the dataset exists - manifest_content = s3_handler.get_object(key=manifest_key, bucket_name=S3_BUCKET) - - if manifest_content is None: + if not manifest_key: return JSONResponse( status_code=404, content={ - "error": f"Dataset not found on S3", - "checked_path": f"s3://{S3_BUCKET}/{manifest_key}" + "error": f"Dataset not found on S3 - processing_manifest.json not found", + "checked_paths": [ + f"s3://{S3_BUCKET}/{dataset_name}/processing_manifest.json", + f"s3://{S3_BUCKET}/{dataset_name}/derived/processing_manifest.json" + ] } ) + logger.info(f"Found dataset manifest at: s3://{S3_BUCKET}/{manifest_key}") + # Download the processing manifest first manifest_local_path = s3_handler.download_file( key=manifest_key, diff --git a/src/see_spot/s3_utils.py b/src/see_spot/s3_utils.py index 93f55fa..f65f655 100644 --- a/src/see_spot/s3_utils.py +++ b/src/see_spot/s3_utils.py @@ -14,6 +14,41 @@ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') logger = logging.getLogger(__name__) + +def find_processing_manifest(bucket: str, dataset_name: str) -> Optional[str]: + """ + Find the processing_manifest.json file in either the top level or derived folder. + + Args: + bucket: S3 bucket name + dataset_name: Dataset name/prefix + + Returns: + Full S3 key to the manifest file, or None if not found + """ + # Try both possible locations + possible_paths = [ + f"{dataset_name}/processing_manifest.json", # Top level + f"{dataset_name}/derived/processing_manifest.json" # Derived folder + ] + + logger.info(f"Searching for processing_manifest.json in dataset '{dataset_name}'") + + for manifest_key in possible_paths: + logger.info(f"Checking: s3://{bucket}/{manifest_key}") + try: + # Try to get metadata (faster than downloading) + metadata = s3_handler.get_object_metadata(key=manifest_key, bucket_name=bucket) + if metadata is not None: + logger.info(f"Found processing manifest at: {manifest_key}") + return manifest_key + except Exception as e: + logger.debug(f"Manifest not found at {manifest_key}: {e}") + continue + + logger.warning(f"Could not find processing_manifest.json in any expected location for dataset '{dataset_name}'") + return None + def optimize_dtypes(df: pl.DataFrame) -> pl.DataFrame: """Optimize DataFrame dtypes to reduce memory usage. From 1f52f60a91d6016c05a0161d00ac1fddaf61c864 Mon Sep 17 00:00:00 2001 From: Carson Berry Date: Thu, 6 Nov 2025 14:37:20 -0800 Subject: [PATCH 02/20] added pointer to new neuroglancer file (stitched) --- src/see_spot/app.py | 68 ++++++++++++++++++----- src/see_spot/ng_utils.py | 116 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 169 insertions(+), 15 deletions(-) diff --git a/src/see_spot/app.py b/src/see_spot/app.py index 492da75..22f693a 100644 --- a/src/see_spot/app.py +++ b/src/see_spot/app.py @@ -49,7 +49,8 @@ "target_key": None, "processing_manifest": None, "spot_channels_from_manifest": None, - "sankey_data": None # Cache Sankey data to avoid recalculation + "sankey_data": None, # Cache Sankey data to avoid recalculation + "unmixed_spots_filename": None # Store unmixed spots filename for neuroglancer logic } @@ -344,6 +345,11 @@ async def get_real_spots_data( S3_BUCKET, related_files_prefix, "unmixed_spots_*.pkl" ) + # Store the unmixed spots filename in cache for neuroglancer logic + if unmixed_target_key: + df_cache["unmixed_spots_filename"] = Path(unmixed_target_key).name + logger.info(f"Cached unmixed spots filename: {df_cache['unmixed_spots_filename']}") + if unmixed_target_key: related_files = find_related_files(S3_BUCKET, related_files_prefix, unmixed_target_key) logger.info(f"Searching for related files in '{related_files_prefix}'. Found: {related_files}") @@ -518,29 +524,61 @@ async def create_neuroglancer_link(request: Request): cell_id = data.get("cell_id", 42) # Default value if not provided spot_id = data.get("spot_id") annotation_color = data.get("annotation_color", "#FFFF00") - cross_section_scale = data.get("cross_section_scale", 1.0) + cross_section_scale = data.get("cross_section_scale", "0.135") # Input validation - if not fused_s3_paths or not position or not point_annotation or not spot_id: + if not position or not point_annotation or not spot_id: return JSONResponse( status_code=400, - content={"error": "Missing required parameters: fused_s3_paths, position, point_annotation, or spot_id"} + content={"error": "Missing required parameters: position, point_annotation, or spot_id"} ) try: # Import the ng_utils module from see_spot import ng_utils - # Create the neuroglancer link - ng_link = ng_utils.create_link_no_upload( - fused_s3_paths, - annotation_color=annotation_color, - cross_section_scale=cross_section_scale, - cell_id=cell_id, - spot_id=spot_id, - position=position, - point_annotation=point_annotation - ) + # Check if we should use the JSON-based method (when "merged" is in the pkl filename) + unmixed_spots_filename = df_cache.get("unmixed_spots_filename") or "" + use_json_method = "merged" in unmixed_spots_filename.lower() + + if use_json_method: + # Use the JSON-based method for merged datasets + logger.info(f"Using create_link_from_json method for merged dataset (filename: {unmixed_spots_filename})") + + # Construct the neuroglancer JSON path + ng_json_path = f"s3://{S3_BUCKET}/{DATA_PREFIX}/phase_correlation_stitching_neuroglancer.json" + logger.info(f"Neuroglancer JSON path: {ng_json_path}") + + # Create the neuroglancer link from JSON + ng_link = ng_utils.create_link_from_json( + ng_json_path=ng_json_path, + position=position, + spot_id=spot_id, + point_annotation=point_annotation, + annotation_color=annotation_color, + spacing=3.0, + cross_section_scale=cross_section_scale + ) + else: + # Use the traditional method for non-merged datasets + logger.info(f"Using create_link_no_upload method for non-merged dataset (filename: {unmixed_spots_filename})") + + if not fused_s3_paths: + return JSONResponse( + status_code=400, + content={"error": "Missing required parameter: fused_s3_paths (required for non-merged datasets)"} + ) + + # Create the neuroglancer link + ng_link = ng_utils.create_link_no_upload( + fused_s3_paths, + annotation_color=annotation_color, + cross_section_scale=cross_section_scale, + cell_id=cell_id, + spot_id=spot_id, + position=position, + point_annotation=point_annotation + ) return {"url": ng_link} except Exception as e: @@ -631,7 +669,7 @@ async def download_dataset(request: Request): content={ "error": "Spots data file not found", "checked_path": f"s3://{S3_BUCKET}/{spots_key}unmixed_spots_*.pkl" - } + ) # Try to create the merged parquet file by calling our new merge function diff --git a/src/see_spot/ng_utils.py b/src/see_spot/ng_utils.py index 2dc8f4c..4c74eb4 100644 --- a/src/see_spot/ng_utils.py +++ b/src/see_spot/ng_utils.py @@ -239,6 +239,122 @@ def create_link_no_upload(fused_s3_path, resolution_zyx=None, return direct_url + +def create_link_from_json(ng_json_path, position, spot_id, point_annotation, + annotation_color="#FFFF00", spacing=3.0, + cross_section_scale= None, base_url="https://neuroglancer-demo.appspot.com"): + """ + Create a Neuroglancer link from an existing JSON file with updated position and annotation. + + Parameters: + ----------- + ng_json_path (str or Path): Path to the neuroglancer JSON file (can be local or S3 path) + position (list): New position coordinates [x, y, z, t] + spot_id (int or str): ID for the spot annotation + point_annotation (list): Point annotation coordinates [x, y, z, ...] + annotation_color (str, optional): Hex color for the annotation. Default: "#FFFF00" + spacing (float, optional): Spacing for annotations in cross-section view. Default: 3.0 + cross_section_scale (float, optional): Scale for cross-section view. If None, keeps existing value + base_url (str, optional): Base Neuroglancer URL. Default: "https://neuroglancer-demo.appspot.com" + + Returns: + -------- + str: Direct Neuroglancer URL with updated state + """ + import json + from pathlib import Path + + # Convert to Path object for easier handling + json_path = Path(ng_json_path) if not isinstance(ng_json_path, Path) else ng_json_path + + # Load the JSON file + try: + if str(json_path).startswith('s3://'): + # Handle S3 paths + import boto3 + s3_path = str(json_path)[5:] # Remove 's3://' + parts = s3_path.split('/') + bucket = parts[0] + key = '/'.join(parts[1:]) + + s3_client = boto3.client('s3') + response = s3_client.get_object(Bucket=bucket, Key=key) + json_content = response['Body'].read().decode('utf-8') + state_dict = json.loads(json_content) + print(f"Loaded Neuroglancer state from S3: s3://{bucket}/{key}") + else: + # Handle local file paths + with open(json_path, 'r') as f: + state_dict = json.load(f) + print(f"Loaded Neuroglancer state from local file: {json_path}") + except FileNotFoundError: + raise FileNotFoundError(f"Neuroglancer JSON file not found: {json_path}") + except json.JSONDecodeError as e: + raise ValueError(f"Invalid JSON in file {json_path}: {e}") + except Exception as e: + raise Exception(f"Error loading Neuroglancer JSON from {json_path}: {e}") + + # Update position + state_dict["position"] = position + print(f"Updated position to: {position}") + + # Update cross-section scale if provided + if cross_section_scale is not None: + state_dict["crossSectionScale"] = cross_section_scale + print(f"Updated crossSectionScale to: {cross_section_scale}") + + # Find or create annotation layer + annotation_layer_found = False + + if "layers" in state_dict: + # Look for existing annotation layer + for i, layer in enumerate(state_dict["layers"]): + if layer.get("type") == "annotation": + # Update existing annotation layer + annotation = { + "type": "point", + "id": str(spot_id), + "point": point_annotation, + } + + # Update the layer properties + state_dict["layers"][i]["name"] = f"Spot {spot_id}" + state_dict["layers"][i]["annotationColor"] = annotation_color + state_dict["layers"][i]["crossSectionAnnotationSpacing"] = spacing + state_dict["layers"][i]["annotations"] = [annotation] + + annotation_layer_found = True + print(f"Updated existing annotation layer with spot {spot_id}") + break + + # If no annotation layer exists, create one + if not annotation_layer_found: + annotation_layer = { + "type": "annotation", + "name": f"Spot {spot_id}", + "tab": "annotations", + "visible": True, + "annotationColor": annotation_color, + "crossSectionAnnotationSpacing": spacing, + "projectionAnnotationSpacing": 10, + "tool": "annotatePoint", + "annotations": [{ + "type": "point", + "id": str(spot_id), + "point": point_annotation, + }] + } + state_dict["layers"].append(annotation_layer) + print(f"Created new annotation layer with spot {spot_id}") + else: + print("Warning: No 'layers' found in Neuroglancer state") + + # Generate direct URL + direct_url = create_direct_neuroglancer_url(state_dict, base_url=base_url) + + return direct_url + + def read_zarr_resolution_boto(s3_path): """ Read resolution from zarr using direct S3 access via boto3 From 903c6bd8becc8292e93ddd4b3b4e48b1cd45f1d6 Mon Sep 17 00:00:00 2001 From: Carson Berry Date: Fri, 7 Nov 2025 11:09:29 -0800 Subject: [PATCH 03/20] refactor: linting etc, + fixing zag's mess --- src/see_spot/app.py | 69 +++---- src/see_spot/ng_utils.py | 334 ++++++++++++++++++--------------- src/see_spot/s3_handler.py | 219 ++++++++++++---------- src/see_spot/s3_utils.py | 374 +++++++++++++++++++++++-------------- 4 files changed, 579 insertions(+), 417 deletions(-) diff --git a/src/see_spot/app.py b/src/see_spot/app.py index 22f693a..4d4ef88 100644 --- a/src/see_spot/app.py +++ b/src/see_spot/app.py @@ -5,6 +5,7 @@ import numpy as np import pandas as pd from datetime import datetime, timedelta +from see_spot import ng_utils import uvicorn import logging import os @@ -51,6 +52,7 @@ "spot_channels_from_manifest": None, "sankey_data": None, # Cache Sankey data to avoid recalculation "unmixed_spots_filename": None # Store unmixed spots filename for neuroglancer logic + "unmixed_spots_filename": None # Store unmixed spots filename for neuroglancer logic } @@ -344,6 +346,11 @@ async def get_real_spots_data( unmixed_target_key = find_unmixed_spots_file( S3_BUCKET, related_files_prefix, "unmixed_spots_*.pkl" ) + # Store the unmixed spots filename in cache for neuroglancer logic + if unmixed_target_key: + df_cache["unmixed_spots_filename"] = Path(unmixed_target_key).name + logger.info(f"Cached unmixed spots filename: {df_cache['unmixed_spots_filename']}") + # Store the unmixed spots filename in cache for neuroglancer logic if unmixed_target_key: @@ -518,13 +525,12 @@ async def create_neuroglancer_link(request: Request): data = await request.json() # Extract the parameters from the request - fused_s3_paths = data.get("fused_s3_paths") + cross_section_scale = data.get("cross_section_scale", "0.135") + spot_id = data.get("spot_id") position = data.get("position") point_annotation = data.get("point_annotation") - cell_id = data.get("cell_id", 42) # Default value if not provided - spot_id = data.get("spot_id") - annotation_color = data.get("annotation_color", "#FFFF00") - cross_section_scale = data.get("cross_section_scale", "0.135") + if not position or not point_annotation or not spot_id: + annotation_color = data.get("annotation_color", "#FFFF00") # Input validation if not position or not point_annotation or not spot_id: @@ -532,37 +538,34 @@ async def create_neuroglancer_link(request: Request): status_code=400, content={"error": "Missing required parameters: position, point_annotation, or spot_id"} ) - - try: - # Import the ng_utils module - from see_spot import ng_utils - - # Check if we should use the JSON-based method (when "merged" is in the pkl filename) - unmixed_spots_filename = df_cache.get("unmixed_spots_filename") or "" - use_json_method = "merged" in unmixed_spots_filename.lower() - + # Check if we should use the JSON-based method (when "merged" is in the pkl filename) + unmixed_spots_filename = df_cache.get("unmixed_spots_filename") or "" + use_json_method = "merged" in unmixed_spots_filename.lower() + try: if use_json_method: - # Use the JSON-based method for merged datasets - logger.info(f"Using create_link_from_json method for merged dataset (filename: {unmixed_spots_filename})") - - # Construct the neuroglancer JSON path - ng_json_path = f"s3://{S3_BUCKET}/{DATA_PREFIX}/phase_correlation_stitching_neuroglancer.json" - logger.info(f"Neuroglancer JSON path: {ng_json_path}") - - # Create the neuroglancer link from JSON - ng_link = ng_utils.create_link_from_json( - ng_json_path=ng_json_path, - position=position, - spot_id=spot_id, - point_annotation=point_annotation, - annotation_color=annotation_color, - spacing=3.0, - cross_section_scale=cross_section_scale - ) + # Use the JSON-based method for merged datasets + logger.info(f"Using create_link_from_json method for merged dataset (filename: {unmixed_spots_filename})") + + # Construct the neuroglancer JSON path + ng_json_path = f"s3://{S3_BUCKET}/{DATA_PREFIX}/phase_correlation_stitching_neuroglancer.json" + logger.info(f"Neuroglancer JSON path: {ng_json_path}") + + # Create the neuroglancer link from JSON + ng_link = ng_utils.create_link_from_json( + ng_json_path=ng_json_path, + position=position, + spot_id=spot_id, + point_annotation=point_annotation, + annotation_color=annotation_color, + spacing=3.0, + cross_section_scale=cross_section_scale + ) else: # Use the traditional method for non-merged datasets logger.info(f"Using create_link_no_upload method for non-merged dataset (filename: {unmixed_spots_filename})") - + fused_s3_paths = data.get("fused_s3_paths") + cell_id = data.get("cell_id", 42) # Default value if not provided + if not fused_s3_paths: return JSONResponse( status_code=400, @@ -579,7 +582,7 @@ async def create_neuroglancer_link(request: Request): position=position, point_annotation=point_annotation ) - + return {"url": ng_link} except Exception as e: logger.error(f"Error creating neuroglancer link: {str(e)}") diff --git a/src/see_spot/ng_utils.py b/src/see_spot/ng_utils.py index 4c74eb4..49e6498 100644 --- a/src/see_spot/ng_utils.py +++ b/src/see_spot/ng_utils.py @@ -16,15 +16,19 @@ import json import urllib.parse -def create_direct_neuroglancer_url(json_data, base_url="https://neuroglancer-demo.appspot.com"): + + +def create_direct_neuroglancer_url( + json_data, base_url="https://neuroglancer-demo.appspot.com" +): """ - Creates a direct Neuroglancer URL by removing the ng_link field + Creates a direct Neuroglancer URL by removing the ng_link field and encoding the remaining JSON as part of the URL. - + Args: json_data: Either a JSON string or a Python dictionary containing the Neuroglancer state base_url: The base Neuroglancer URL to use - + Returns: str: A complete Neuroglancer URL with the JSON state encoded in the fragment """ @@ -33,42 +37,54 @@ def create_direct_neuroglancer_url(json_data, base_url="https://neuroglancer-dem data = json.loads(json_data) else: data = json_data.copy() - + # Remove the ng_link key if it exists if "ng_link" in data: del data["ng_link"] - + # Convert to JSON string and encode for URL json_str = json.dumps(data) encoded_json = urllib.parse.quote(json_str) - + # Ensure base URL ends with / - if not base_url.endswith('/'): - base_url += '/' - + if not base_url.endswith("/"): + base_url += "/" + # Create the full URL full_url = f"{base_url}#!{encoded_json}" - + # Check URL length and print warning if too long url_length = len(full_url) print(f"URL character count: {url_length}") - + if url_length > 5000: - print(f"WARNING: URL length ({url_length} characters) exceeds 5000 characters.") + print( + f"WARNING: URL length ({url_length} characters) exceeds 5000 characters." + ) print("This may cause issues in some browsers or web servers.") print("Consider reducing JSON complexity or using a URL shortener.") - + return full_url -def create_link_no_upload(fused_s3_path, resolution_zyx=None, - max_dr=1200, opacity=1.0, blend="additive", - annotation_color="#ff0000", spacing=3.0, cross_section_scale=1.0, - position=None, cell_id: int = 0, spot_id=None, point_annotation=None, - output_folder=None): +def create_link_no_upload( + fused_s3_path, + resolution_zyx=None, + max_dr=1200, + opacity=1.0, + blend="additive", + annotation_color="#ff0000", + spacing=3.0, + cross_section_scale=1.0, + position=None, + cell_id: int = 0, + spot_id=None, + point_annotation=None, + output_folder=None, +): """ Create a Neuroglancer JSON file for multiple channels with a single point annotation. - + Parameters: fused_s3_path (str or list): S3 path(s) to the fused dataset(s). Can be a single string or list of strings. resolution_zyx (list, optional): Resolution in z,y,x order. If None, attempts to read from YAML. @@ -82,85 +98,82 @@ def create_link_no_upload(fused_s3_path, resolution_zyx=None, cell_id (int): cell id to plot (gets added to NG json filename) spot_id (str or int, optional): ID for the spot annotation point_annotation (dict or list, optional): Coordinates [x,y,z] for a single point annotation - + Returns: str: URL to the Neuroglancer link """ # Convert single paths to lists for consistent processing if isinstance(fused_s3_path, str): fused_s3_path = [fused_s3_path] - + # If resolution not provided, try to read from first zarr file if resolution_zyx is None: try: resolution_zyx = read_zarr_resolution_boto(fused_s3_path[0]) print(f"Found resolution from zarr: {resolution_zyx}") except Exception as e: - print(f"Warning: Could not read resolution from zarr file: {str(e)}") + print( + f"Warning: Could not read resolution from zarr file: {str(e)}" + ) # Provide a default resolution if we can't read it resolution_zyx = [1.0, 1.0, 1.0] print(f"Using default resolution: {resolution_zyx}") output_dimensions = { - "x": {"voxel_size": resolution_zyx[2], "unit": "microns"}, - "y": {"voxel_size": resolution_zyx[1], "unit": "microns"}, - "z": {"voxel_size": resolution_zyx[0], "unit": "microns"}, - "c'": {"voxel_size": 1, "unit": ""}, - "t": {"voxel_size": 0.001, "unit": "seconds"}, - } + "x": {"voxel_size": resolution_zyx[2], "unit": "microns"}, + "y": {"voxel_size": resolution_zyx[1], "unit": "microns"}, + "z": {"voxel_size": resolution_zyx[0], "unit": "microns"}, + "c'": {"voxel_size": 1, "unit": ""}, + "t": {"voxel_size": 0.001, "unit": "seconds"}, + } # Initialize layers list layers = [] # Represent Neuroglancer Tabs - + # Process each fused path for idx, fused_path in enumerate(fused_s3_path): # Extract channel number from fused path pattern = r"(ch|CH|channel)_(\d+)" match = re.search(pattern, fused_path) if not match: - raise ValueError(f"Could not extract channel number from path: {fused_path}") - + raise ValueError( + f"Could not extract channel number from path: {fused_path}" + ) + channel = int(match.group(2)) hex_val = wavelength_to_hex_pure_colours(channel) hex_str = f"#{hex_val:06x}" - + # Add image layer image_layer = { "type": "image", "source": fused_path, "channel": 0, - "shaderControls": { - "normalized": {"range": [90, max_dr]} - }, + "shaderControls": {"normalized": {"range": [90, max_dr]}}, "shader": { "color": hex_str, "emitter": "RGB", "vec": "vec3", }, - "localPosition": [ - 0.5 - ], + "localPosition": [0.5], "visible": True, "opacity": opacity, "name": f"CH_{channel}", "blend": blend, } layers.append(image_layer) - - - + # Add specific point annotation if provided if point_annotation is not None: - # convert output_dimensions to a meter]} - # Create a single annotation layer for the point + # Create a single annotation layer for the point annotation_layer = { "type": "annotation", # "source": { # "url":"local://annotations", # "transform": output_dimensions # }, - #"source": "local://annotations", + # "source": "local://annotations", "name": f"Spot {spot_id}", "tab": "annotations", "visible": True, @@ -168,25 +181,23 @@ def create_link_no_upload(fused_s3_path, resolution_zyx=None, "crossSectionAnnotationSpacing": spacing, "projectionAnnotationSpacing": 10, "tool": "annotatePoint", - } - #point = {"x":point_annotation[0], "y":point_annotation[1], "z":point_annotation[2], "t":point_annotation[3]} - + # point = {"x":point_annotation[0], "y":point_annotation[1], "z":point_annotation[2], "t":point_annotation[3]} annotation = { "type": "point", "id": str(spot_id) if spot_id is not None else "spot", "point": point_annotation, - #"description": f"Spot ID: {spot_id}" if spot_id is not None else "Point annotation" + # "description": f"Spot ID: {spot_id}" if spot_id is not None else "Point annotation" } annotation_layer["annotations"] = [annotation] - + # Use the point coordinates as the position if no position is specified if position is None: position = point + [0] # Add time dimension (t=0) - + # Add the annotation layer annotation_layer print(f"annotation_layer: {annotation_layer}") @@ -199,38 +210,37 @@ def create_link_no_upload(fused_s3_path, resolution_zyx=None, "showAxisLines": False, } - # Extract bucket and dataset from first fused path - parts = fused_s3_path[0].split('/') + parts = fused_s3_path[0].split("/") bucket_name = parts[2] dataset_name = parts[3] - + # Set up output folder if output_folder is None: cd = os.getcwd() output_folder = f"{cd}/{dataset_name}/" if not pathlib.Path(output_folder).exists(): pathlib.Path(output_folder).mkdir(parents=True, exist_ok=True) - + # Create JSON file name json_name = f"point_annotation_ng_link_{spot_id if spot_id is not None else 'spot'}.json" - + # Generate the Neuroglancer state neuroglancer_link = NgState( input_config, "s3", bucket_name, output_folder, - dataset_name = pathlib.Path(output_folder).stem, - base_url="https://neuroglancer-demo.appspot.com", + dataset_name=pathlib.Path(output_folder).stem, + base_url="https://neuroglancer-demo.appspot.com", json_name=json_name, ) state_dict = neuroglancer_link.state # add crossSectionScale to state_dict # append annotation_layer to state_dict["layers"] - #annotation_layer["source"]["transform"] = state_dict["dimensions"] # THIS BRINGS METERS IN - + # annotation_layer["source"]["transform"] = state_dict["dimensions"] # THIS BRINGS METERS IN + state_dict["layers"].append(annotation_layer) state_dict["crossSectionScale"] = cross_section_scale state_dict["position"] = position @@ -240,72 +250,88 @@ def create_link_no_upload(fused_s3_path, resolution_zyx=None, return direct_url -def create_link_from_json(ng_json_path, position, spot_id, point_annotation, - annotation_color="#FFFF00", spacing=3.0, - cross_section_scale= None, base_url="https://neuroglancer-demo.appspot.com"): +def create_link_from_json( + ng_json_path, + position, + spot_id, + point_annotation, + annotation_color="#FFFF00", + spacing=3.0, + cross_section_scale=None, + base_url="https://neuroglancer-demo.appspot.com", +): """ Create a Neuroglancer link from an existing JSON file with updated position and annotation. - + Parameters: ----------- ng_json_path (str or Path): Path to the neuroglancer JSON file (can be local or S3 path) position (list): New position coordinates [x, y, z, t] spot_id (int or str): ID for the spot annotation - point_annotation (list): Point annotation coordinates [x, y, z, ...] + point_annotation (list): Point annotation coordinates [x, y, z, ...] annotation_color (str, optional): Hex color for the annotation. Default: "#FFFF00" spacing (float, optional): Spacing for annotations in cross-section view. Default: 3.0 cross_section_scale (float, optional): Scale for cross-section view. If None, keeps existing value base_url (str, optional): Base Neuroglancer URL. Default: "https://neuroglancer-demo.appspot.com" - + Returns: -------- str: Direct Neuroglancer URL with updated state """ import json from pathlib import Path - + # Convert to Path object for easier handling - json_path = Path(ng_json_path) if not isinstance(ng_json_path, Path) else ng_json_path - + json_path = ( + Path(ng_json_path) + if not isinstance(ng_json_path, Path) + else ng_json_path + ) + # Load the JSON file try: - if str(json_path).startswith('s3://'): + if str(json_path).startswith("s3://"): # Handle S3 paths import boto3 + s3_path = str(json_path)[5:] # Remove 's3://' - parts = s3_path.split('/') + parts = s3_path.split("/") bucket = parts[0] - key = '/'.join(parts[1:]) - - s3_client = boto3.client('s3') + key = "/".join(parts[1:]) + + s3_client = boto3.client("s3") response = s3_client.get_object(Bucket=bucket, Key=key) - json_content = response['Body'].read().decode('utf-8') + json_content = response["Body"].read().decode("utf-8") state_dict = json.loads(json_content) print(f"Loaded Neuroglancer state from S3: s3://{bucket}/{key}") else: # Handle local file paths - with open(json_path, 'r') as f: + with open(json_path, "r") as f: state_dict = json.load(f) print(f"Loaded Neuroglancer state from local file: {json_path}") except FileNotFoundError: - raise FileNotFoundError(f"Neuroglancer JSON file not found: {json_path}") + raise FileNotFoundError( + f"Neuroglancer JSON file not found: {json_path}" + ) except json.JSONDecodeError as e: raise ValueError(f"Invalid JSON in file {json_path}: {e}") except Exception as e: - raise Exception(f"Error loading Neuroglancer JSON from {json_path}: {e}") - + raise Exception( + f"Error loading Neuroglancer JSON from {json_path}: {e}" + ) + # Update position state_dict["position"] = position print(f"Updated position to: {position}") - + # Update cross-section scale if provided if cross_section_scale is not None: state_dict["crossSectionScale"] = cross_section_scale print(f"Updated crossSectionScale to: {cross_section_scale}") - + # Find or create annotation layer annotation_layer_found = False - + if "layers" in state_dict: # Look for existing annotation layer for i, layer in enumerate(state_dict["layers"]): @@ -316,17 +342,19 @@ def create_link_from_json(ng_json_path, position, spot_id, point_annotation, "id": str(spot_id), "point": point_annotation, } - + # Update the layer properties state_dict["layers"][i]["name"] = f"Spot {spot_id}" state_dict["layers"][i]["annotationColor"] = annotation_color - state_dict["layers"][i]["crossSectionAnnotationSpacing"] = spacing + state_dict["layers"][i][ + "crossSectionAnnotationSpacing" + ] = spacing state_dict["layers"][i]["annotations"] = [annotation] - + annotation_layer_found = True print(f"Updated existing annotation layer with spot {spot_id}") break - + # If no annotation layer exists, create one if not annotation_layer_found: annotation_layer = { @@ -338,20 +366,22 @@ def create_link_from_json(ng_json_path, position, spot_id, point_annotation, "crossSectionAnnotationSpacing": spacing, "projectionAnnotationSpacing": 10, "tool": "annotatePoint", - "annotations": [{ - "type": "point", - "id": str(spot_id), - "point": point_annotation, - }] + "annotations": [ + { + "type": "point", + "id": str(spot_id), + "point": point_annotation, + } + ], } state_dict["layers"].append(annotation_layer) print(f"Created new annotation layer with spot {spot_id}") else: print("Warning: No 'layers' found in Neuroglancer state") - + # Generate direct URL direct_url = create_direct_neuroglancer_url(state_dict, base_url=base_url) - + return direct_url @@ -359,66 +389,75 @@ def read_zarr_resolution_boto(s3_path): """ Read resolution from zarr using direct S3 access via boto3 found s3fs/zarr was not working, so using boto3 (MD) - + Parameters: s3_path (str): S3 path to the zarr dataset - + Returns: list: Resolution in z,y,x order in micrometers """ import boto3 import json - + # Parse the S3 path - if s3_path.startswith('s3://'): + if s3_path.startswith("s3://"): s3_path = s3_path[5:] # Remove 's3://' - - parts = s3_path.split('/') + + parts = s3_path.split("/") bucket = parts[0] - prefix = '/'.join(parts[1:]) - + prefix = "/".join(parts[1:]) + # Create boto3 client - s3_client = boto3.client('s3') - + s3_client = boto3.client("s3") + try: # Try to get the .zattrs file which should contain resolution metadata zattrs_key = f"{prefix}/.zattrs" print(f"Reading {zattrs_key} from bucket {bucket}") response = s3_client.get_object(Bucket=bucket, Key=zattrs_key) - zattrs_content = response['Body'].read().decode('utf-8') + zattrs_content = response["Body"].read().decode("utf-8") zattrs = json.loads(zattrs_content) - + # Look for resolution in multiscales metadata - if 'multiscales' in zattrs and zattrs['multiscales']: - multiscale = zattrs['multiscales'][0] - - if 'axes' in multiscale: - axes = multiscale['axes'] - axes_map = {axis['name']: i for i, axis in enumerate(axes)} - - z_idx = axes_map.get('z') - y_idx = axes_map.get('y') - x_idx = axes_map.get('x') - - if 'datasets' in multiscale and multiscale['datasets']: - dataset = multiscale['datasets'][0] - if 'coordinateTransformations' in dataset: - for transform in dataset['coordinateTransformations']: - if transform.get('type') == 'scale': - scale = transform['scale'] - - if all(idx is not None for idx in [z_idx, y_idx, x_idx]): - print(f"Found resolution from multiscales: {[scale[z_idx], scale[y_idx], scale[x_idx]]}") - return [scale[z_idx], scale[y_idx], scale[x_idx]] - + if "multiscales" in zattrs and zattrs["multiscales"]: + multiscale = zattrs["multiscales"][0] + + if "axes" in multiscale: + axes = multiscale["axes"] + axes_map = {axis["name"]: i for i, axis in enumerate(axes)} + + z_idx = axes_map.get("z") + y_idx = axes_map.get("y") + x_idx = axes_map.get("x") + + if "datasets" in multiscale and multiscale["datasets"]: + dataset = multiscale["datasets"][0] + if "coordinateTransformations" in dataset: + for transform in dataset["coordinateTransformations"]: + if transform.get("type") == "scale": + scale = transform["scale"] + + if all( + idx is not None + for idx in [z_idx, y_idx, x_idx] + ): + print( + f"Found resolution from multiscales: {[scale[z_idx], scale[y_idx], scale[x_idx]]}" + ) + return [ + scale[z_idx], + scale[y_idx], + scale[x_idx], + ] + # Check for direct resolution attribute - if 'resolution' in zattrs: + if "resolution" in zattrs: print(f"Found direct resolution attribute: {zattrs['resolution']}") - return list(zattrs['resolution']) - + return list(zattrs["resolution"]) + except Exception as e: print(f"Error reading .zattrs: {str(e)}") - + print(f"Using default resolution for {s3_path}") return [1.0, 1.0, 1.0] @@ -441,29 +480,26 @@ def wavelength_to_hex_pure_colours(wavelength: int) -> int: # Color map wavelength/hex pairs are generated # by sampling along a CIE diagram arc. color_map = { - 0: 0xFFFFFF, #white - 1: 0x00FF00, # Blue - 2: 0xFF0000, # Red - 3: 0x0000FF, # Blue - 4: 0x00FFFF, #cyan - 5: 0xFF00FF, # magenta #638 - - - #420: 0xFFFFFF, #white #405 - #490: 0x5DF8D6, # Green #488 - #520: 0x4B90FE, # Blue #515 - #570: 0xE9EC02, # Yellow #561 - #600: 0xF00050, # Pink #594 - #650: 0xF0121E, # Red #638 - - 420: 0xFFFFFF, #white #405 + 0: 0xFFFFFF, # white + 1: 0x00FF00, # Blue + 2: 0xFF0000, # Red + 3: 0x0000FF, # Blue + 4: 0x00FFFF, # cyan + 5: 0xFF00FF, # magenta #638 + # 420: 0xFFFFFF, #white #405 + # 490: 0x5DF8D6, # Green #488 + # 520: 0x4B90FE, # Blue #515 + # 570: 0xE9EC02, # Yellow #561 + # 600: 0xF00050, # Pink #594 + # 650: 0xF0121E, # Red #638 + 420: 0xFFFFFF, # white #405 490: 0x00FF00, # Green #488 520: 0xFF0000, # Red #515 570: 0x0000FF, # Blue #561 - 600: 0x00FFFF, #cyan #594 #600: 0xFFF000, # Orange #594 #or should be cyan? + 600: 0x00FFFF, # cyan #594 #600: 0xFFF000, # Orange #594 #or should be cyan? 650: 0xFF00FF, # magenta #638 } for ub, hex_val in color_map.items(): if wavelength < ub: # Exclusive return hex_val - return hex_val # hex_val is set to the last color in for loop \ No newline at end of file + return hex_val # hex_val is set to the last color in for loop diff --git a/src/see_spot/s3_handler.py b/src/see_spot/s3_handler.py index 61dfec4..50783aa 100644 --- a/src/see_spot/s3_handler.py +++ b/src/see_spot/s3_handler.py @@ -9,16 +9,18 @@ from botocore.exceptions import ClientError from pathlib import Path from typing import Optional, Union + # Configure logging logger = logging.getLogger(__name__) + class S3Handler: """Handler for S3 operations.""" - + def __init__(self, bucket_name=None): """ Initialize S3 client using environment credentials. - + Args: bucket_name (str, optional): Default S3 bucket name. """ @@ -26,149 +28,142 @@ def __init__(self, bucket_name=None): self.s3_resource = None self.bucket_name = bucket_name self.init_s3_client() - + def init_s3_client(self): """Initialize the S3 client using credentials from environment variables.""" try: - # Create S3 client - boto3 will automatically use AWS_ACCESS_KEY_ID, + # Create S3 client - boto3 will automatically use AWS_ACCESS_KEY_ID, # AWS_SECRET_ACCESS_KEY, and AWS_SESSION_TOKEN from environment - self.s3_client = boto3.client('s3') - self.s3_resource = boto3.resource('s3') + self.s3_client = boto3.client("s3") + self.s3_resource = boto3.resource("s3") logger.info("S3 client initialized successfully") except Exception as e: logger.error(f"Failed to initialize S3 client: {e}") raise - + def test_connection(self, bucket_name=None): """ Test connection to S3 by listing objects in a bucket. - + Args: bucket_name (str, optional): S3 bucket name to test. Uses default if not provided. - + Returns: dict: Test results with success status and message """ bucket = bucket_name or self.bucket_name - + if not bucket: return { "success": False, - "message": "No bucket name provided for test" + "message": "No bucket name provided for test", } - + try: # Try to list objects (limited to 5 for test) - response = self.s3_client.list_objects_v2( - Bucket=bucket, - MaxKeys=5 - ) - + response = self.s3_client.list_objects_v2(Bucket=bucket, MaxKeys=5) + # Check if we can access the bucket - if 'Contents' in response: - object_count = len(response['Contents']) - objects = [obj['Key'] for obj in response['Contents']] - + if "Contents" in response: + object_count = len(response["Contents"]) + objects = [obj["Key"] for obj in response["Contents"]] + return { "success": True, "message": f"Successfully connected to bucket '{bucket}'", "object_count": object_count, - "sample_objects": objects + "sample_objects": objects, } else: return { "success": True, - "message": f"Successfully connected to bucket '{bucket}' but it appears to be empty" + "message": f"Successfully connected to bucket '{bucket}' but it appears to be empty", } - + except ClientError as e: - error_code = e.response['Error']['Code'] - error_message = e.response['Error']['Message'] - - if error_code == 'NoSuchBucket': + error_code = e.response["Error"]["Code"] + error_message = e.response["Error"]["Message"] + + if error_code == "NoSuchBucket": return { "success": False, - "message": f"Bucket '{bucket}' does not exist" + "message": f"Bucket '{bucket}' does not exist", } - elif error_code == 'AccessDenied': + elif error_code == "AccessDenied": return { "success": False, - "message": f"Access denied to bucket '{bucket}'. Check your credentials and permissions." + "message": f"Access denied to bucket '{bucket}'. Check your credentials and permissions.", } else: return { "success": False, - "message": f"Error accessing bucket '{bucket}': {error_message}" + "message": f"Error accessing bucket '{bucket}': {error_message}", } except Exception as e: - return { - "success": False, - "message": f"Unexpected error: {str(e)}" - } + return {"success": False, "message": f"Unexpected error: {str(e)}"} def list_objects(self, bucket_name=None, prefix="", max_keys=1000): """ List objects in a bucket with optional prefix filtering. - + Args: bucket_name (str, optional): S3 bucket name. Uses default if not provided. prefix (str, optional): Filter objects by prefix max_keys (int, optional): Maximum number of keys to return - + Returns: list: List of object keys """ bucket = bucket_name or self.bucket_name - + if not bucket: logger.error("No bucket name provided") return [] - + try: - paginator = self.s3_client.get_paginator('list_objects_v2') + paginator = self.s3_client.get_paginator("list_objects_v2") objects = [] - + # Paginate through results for page in paginator.paginate( Bucket=bucket, Prefix=prefix, - PaginationConfig={"MaxItems": max_keys} + PaginationConfig={"MaxItems": max_keys}, ): - if 'Contents' in page: - for obj in page['Contents']: - objects.append(obj['Key']) - + if "Contents" in page: + for obj in page["Contents"]: + objects.append(obj["Key"]) + return objects - + except Exception as e: logger.error(f"Error listing objects in bucket '{bucket}': {e}") return [] - + def get_object(self, key, bucket_name=None): """ Get an object from S3. - + Args: key (str): Object key bucket_name (str, optional): S3 bucket name. Uses default if not provided. - + Returns: bytes: Object data or None if error """ bucket = bucket_name or self.bucket_name - + if not bucket: logger.error("No bucket name provided") return None - + try: - response = self.s3_client.get_object( - Bucket=bucket, - Key=key - ) - return response['Body'].read() + response = self.s3_client.get_object(Bucket=bucket, Key=key) + return response["Body"].read() except Exception as e: - logger.error(f"Error getting object '{key}' from bucket '{bucket}': {e}") + logger.error( + f"Error getting object '{key}' from bucket '{bucket}': {e}" + ) return None def get_object_metadata(self, key, bucket_name=None): @@ -190,37 +185,37 @@ def get_object_metadata(self, key, bucket_name=None): try: # Use head_object to get metadata without downloading the body - response = self.s3_client.head_object( - Bucket=bucket, - Key=key - ) + response = self.s3_client.head_object(Bucket=bucket, Key=key) # Return relevant metadata return { - 'ContentLength': response.get('ContentLength'), - 'LastModified': response.get('LastModified'), - 'ContentType': response.get('ContentType'), - 'ETag': response.get('ETag') + "ContentLength": response.get("ContentLength"), + "LastModified": response.get("LastModified"), + "ContentType": response.get("ContentType"), + "ETag": response.get("ETag") # Add other metadata fields from response if needed } except ClientError as e: # Handle common errors like Not Found - if e.response['Error']['Code'] == '404': - logger.warning(f"Object '{key}' not found in bucket '{bucket}'.") + if e.response["Error"]["Code"] == "404": + logger.warning( + f"Object '{key}' not found in bucket '{bucket}'." + ) else: - logger.error(f"Error getting metadata for object '{key}' from bucket '{bucket}': {e}") + logger.error( + f"Error getting metadata for object '{key}' from bucket '{bucket}': {e}" + ) return None except Exception as e: logger.error(f"Unexpected error getting metadata for '{key}': {e}") return None - def download_file( self, key: str, bucket_name: Optional[str] = None, local_path: Optional[Union[str, Path]] = None, use_cache: bool = True, - cache_dir: Union[str, Path] = '/s3-cache' + cache_dir: Union[str, Path] = "/s3-cache", ) -> Optional[Path]: """ Downloads a file from S3, optionally using a local cache. @@ -244,39 +239,53 @@ def download_file( return None if self.s3_client is None: - logger.error("S3 client is not initialized.") - return None + logger.error("S3 client is not initialized.") + return None effective_local_path: Path is_cache_path = False if local_path: # User specified an exact download location - effective_local_path = Path(local_path).resolve() # Resolve to absolute path - logger.info(f"Direct download requested to: {effective_local_path}") - use_cache = False # Explicit path overrides cache usage check + effective_local_path = Path( + local_path + ).resolve() # Resolve to absolute path + logger.info( + f"Direct download requested to: {effective_local_path}" + ) + use_cache = False # Explicit path overrides cache usage check else: # Construct path within the cache directory is_cache_path = True - base_cache_dir = Path(cache_dir).resolve() # Resolve cache dir path + base_cache_dir = Path( + cache_dir + ).resolve() # Resolve cache dir path # Combine cache base, bucket, and key to form path # Ensure key is treated as relative within the bucket folder - safe_key_part = key.lstrip('/') + safe_key_part = key.lstrip("/") effective_local_path = base_cache_dir / bucket / safe_key_part logger.debug(f"Cache path constructed: {effective_local_path}") # Check cache if requested and applicable - if use_cache and effective_local_path.is_file(): # Check if it's actually a file - logger.info(f"Cache hit! Using local file: {effective_local_path}") + if ( + use_cache and effective_local_path.is_file() + ): # Check if it's actually a file + logger.info( + f"Cache hit! Using local file: {effective_local_path}" + ) # Optional: Could add check here to compare S3 etag/last_modified # with cached file metadata if cache invalidation is needed. return effective_local_path elif use_cache: - logger.info(f"Cache miss or not a file: {effective_local_path}") + logger.info( + f"Cache miss or not a file: {effective_local_path}" + ) # If not using cache or file not found, proceed to download # --- Download required --- - logger.info(f"Attempting to download s3://{bucket}/{key} to {effective_local_path}") + logger.info( + f"Attempting to download s3://{bucket}/{key} to {effective_local_path}" + ) try: # Ensure parent directory exists @@ -286,20 +295,28 @@ def download_file( self.s3_client.download_file( Bucket=bucket, Key=key, - Filename=str(effective_local_path) # download_file expects a string path + Filename=str( + effective_local_path + ), # download_file expects a string path ) logger.info(f"Successfully downloaded to: {effective_local_path}") return effective_local_path except ClientError as e: # Check for specific errors like Not Found - error_code = e.response.get('Error', {}).get('Code') - if error_code == '404' or 'NoSuchKey' in str(e): # Check common variations - logger.error(f"Error: Object not found on S3: s3://{bucket}/{key}") - elif error_code == 'NoSuchBucket': - logger.error(f"Error: Bucket not found: {bucket}") + error_code = e.response.get("Error", {}).get("Code") + if error_code == "404" or "NoSuchKey" in str( + e + ): # Check common variations + logger.error( + f"Error: Object not found on S3: s3://{bucket}/{key}" + ) + elif error_code == "NoSuchBucket": + logger.error(f"Error: Bucket not found: {bucket}") else: - logger.error(f"S3 ClientError during download for key '{key}': {e}") + logger.error( + f"S3 ClientError during download for key '{key}': {e}" + ) # Consider removing partially downloaded file if download_file guarantees creation # Check if file exists and maybe size is 0 before unlinking # try: @@ -310,30 +327,38 @@ def download_file( # logger.error(f"Error removing incomplete file {effective_local_path}: {unlink_err}") return None except Exception as e: - logger.error(f"Unexpected error during download of '{key}': {e}", exc_info=True) + logger.error( + f"Unexpected error during download of '{key}': {e}", + exc_info=True, + ) return None + # Create a global instance for easy access # s3_handler = S3Handler('aind-open-data') -s3_handler = S3Handler('codeocean-s3resultsbucket-1182nktl2bh9f') +s3_handler = S3Handler("codeocean-s3resultsbucket-1182nktl2bh9f") + # Test function that can be called to verify connection def test_s3_connection(): """Test S3 connection and print results.""" results = s3_handler.test_connection() - + if results["success"]: print(f"✅ {results['message']}") - + if "sample_objects" in results: - print(f"\nFound {results['object_count']} objects. Sample objects:") + print( + f"\nFound {results['object_count']} objects. Sample objects:" + ) for obj in results["sample_objects"]: print(f" - {obj}") else: print(f"❌ {results['message']}") - + return results + # For direct testing if __name__ == "__main__": - test_s3_connection() \ No newline at end of file + test_s3_connection() diff --git a/src/see_spot/s3_utils.py b/src/see_spot/s3_utils.py index f65f655..10b2ae5 100644 --- a/src/see_spot/s3_utils.py +++ b/src/see_spot/s3_utils.py @@ -11,75 +11,87 @@ import logging import tempfile import os -logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') + +logging.basicConfig( + level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s" +) logger = logging.getLogger(__name__) def find_processing_manifest(bucket: str, dataset_name: str) -> Optional[str]: """ Find the processing_manifest.json file in either the top level or derived folder. - + Args: bucket: S3 bucket name dataset_name: Dataset name/prefix - + Returns: Full S3 key to the manifest file, or None if not found """ # Try both possible locations possible_paths = [ f"{dataset_name}/processing_manifest.json", # Top level - f"{dataset_name}/derived/processing_manifest.json" # Derived folder + f"{dataset_name}/derived/processing_manifest.json", # Derived folder ] - - logger.info(f"Searching for processing_manifest.json in dataset '{dataset_name}'") - + + logger.info( + f"Searching for processing_manifest.json in dataset '{dataset_name}'" + ) + for manifest_key in possible_paths: logger.info(f"Checking: s3://{bucket}/{manifest_key}") try: # Try to get metadata (faster than downloading) - metadata = s3_handler.get_object_metadata(key=manifest_key, bucket_name=bucket) + metadata = s3_handler.get_object_metadata( + key=manifest_key, bucket_name=bucket + ) if metadata is not None: logger.info(f"Found processing manifest at: {manifest_key}") return manifest_key except Exception as e: logger.debug(f"Manifest not found at {manifest_key}: {e}") continue - - logger.warning(f"Could not find processing_manifest.json in any expected location for dataset '{dataset_name}'") + + logger.warning( + f"Could not find processing_manifest.json in any expected location for dataset '{dataset_name}'" + ) return None + def optimize_dtypes(df: pl.DataFrame) -> pl.DataFrame: """Optimize DataFrame dtypes to reduce memory usage. - + Args: df: Input Polars DataFrame - + Returns: DataFrame with optimized dtypes """ logger.info("Optimizing data types for memory efficiency...") - + # Define columns that should remain as specific types - string_cols = ['chan', 'unmixed_chan', 'cell_id'] - int_cols = ['spot_id', 'chan_spot_id', 'round'] - bool_cols = ['valid_spot', 'reassigned', 'unmixed_removed'] - + string_cols = ["chan", "unmixed_chan", "cell_id"] + int_cols = ["spot_id", "chan_spot_id", "round"] + bool_cols = ["valid_spot", "reassigned", "unmixed_removed"] + # Get current columns current_cols = df.columns - + # Build casting dictionary cast_dict = {} - + for col in current_cols: if col in string_cols: cast_dict[col] = pl.Utf8 elif col in int_cols: # Use smaller int types where possible - if col in ['round']: + if col in ["round"]: cast_dict[col] = pl.Int8 # rounds typically 1-10 else: - cast_dict[col] = pl.Int32 # spot_ids can be large but usually fit in Int32 + cast_dict[ + col + ] = pl.Int32 # spot_ids can be large but usually fit in Int32 elif col in bool_cols: cast_dict[col] = pl.Boolean elif df[col].dtype in [pl.Float64, pl.Float32]: @@ -88,11 +100,15 @@ def optimize_dtypes(df: pl.DataFrame) -> pl.DataFrame: max_val = df[col].max() min_val = df[col].min() if max_val is not None and min_val is not None: - if abs(max_val) < 3.4e38 and abs(min_val) < 3.4e38: # Float32 range + if ( + abs(max_val) < 3.4e38 and abs(min_val) < 3.4e38 + ): # Float32 range cast_dict[col] = pl.Float32 else: - cast_dict[col] = pl.Float64 # Keep as Float64 if values are too large - + cast_dict[ + col + ] = pl.Float64 # Keep as Float64 if values are too large + # Apply casting if cast_dict: df_optimized = df.cast(cast_dict) @@ -112,8 +128,8 @@ def merge_spots_tables(spots_mixed, spots_unmixed): Returns: pl.DataFrame: Merged DataFrame with unmixed_removed column """ - mixed_clean = spots_mixed.drop('spot_id', strict=False) - unmixed_clean = spots_unmixed.drop('spot_id', strict=False) + mixed_clean = spots_mixed.drop("spot_id", strict=False) + unmixed_clean = spots_unmixed.drop("spot_id", strict=False) # Get columns that are unique to unmixed table mixed_cols = set(mixed_clean.columns) @@ -121,33 +137,41 @@ def merge_spots_tables(spots_mixed, spots_unmixed): unique_unmixed_cols = list(unmixed_cols - mixed_cols) # Keep only merge keys and unique columns from unmixed - merge_keys = ['chan', 'chan_spot_id'] + merge_keys = ["chan", "chan_spot_id"] select_cols = merge_keys + unique_unmixed_cols unmixed_subset = unmixed_clean.select(select_cols) - merged = mixed_clean.join(unmixed_subset, on=merge_keys, how='left') + merged = mixed_clean.join(unmixed_subset, on=merge_keys, how="left") # Add unmixed_removed column - True where any unique unmixed column is null if unique_unmixed_cols: # Create condition: all unique unmixed columns are null - null_conditions = [pl.col(col).is_null() for col in unique_unmixed_cols] + null_conditions = [ + pl.col(col).is_null() for col in unique_unmixed_cols + ] all_null = pl.fold(True, lambda acc, x: acc & x, null_conditions) merged = merged.with_columns(unmixed_removed=all_null) else: merged = merged.with_columns(unmixed_removed=pl.lit(False)) - merged_with_id = merged.with_row_index(name='spot_id', offset=1) + merged_with_id = merged.with_row_index(name="spot_id", offset=1) merged_optimized = optimize_dtypes(merged_with_id) logger.info(f"Merge completed. Final shape: {merged_optimized.shape}") return merged_optimized -def find_mixed_spots_file(bucket: str, prefix: str, pattern: str) -> Optional[str]: +def find_mixed_spots_file( + bucket: str, prefix: str, pattern: str +) -> Optional[str]: """Finds the first mixed spots file matching the pattern within the prefix.""" - logger.info(f"Searching for mixed spots pattern '{pattern}' in bucket '{bucket}' with prefix '{prefix}'...") + logger.info( + f"Searching for mixed spots pattern '{pattern}' in bucket '{bucket}' with prefix '{prefix}'..." + ) try: # List objects - consider increasing max_keys if many files share the prefix - objects = s3_handler.list_objects(bucket_name=bucket, prefix=prefix, max_keys=200) + objects = s3_handler.list_objects( + bucket_name=bucket, prefix=prefix, max_keys=200 + ) if not objects: logger.warning(f"No objects found with prefix '{prefix}'.") return None @@ -161,16 +185,22 @@ def find_mixed_spots_file(bucket: str, prefix: str, pattern: str) -> Optional[st found_files.append(key) if not found_files: - logger.warning(f"No mixed spots files matching pattern '{pattern}' found within the first {len(objects)} objects listed under prefix '{prefix}'.") + logger.warning( + f"No mixed spots files matching pattern '{pattern}' found within the first {len(objects)} objects listed under prefix '{prefix}'." + ) return None if len(found_files) > 1: - logger.warning(f"Multiple mixed spots files ({len(found_files)}) matching pattern found. Using the first one: {found_files[0]}") + logger.warning( + f"Multiple mixed spots files ({len(found_files)}) matching pattern found. Using the first one: {found_files[0]}" + ) - return found_files[0] # Return the full key of the first match + return found_files[0] # Return the full key of the first match except Exception as e: - logger.error(f"Error listing or searching objects: {e}", exc_info=True) # Log traceback + logger.error( + f"Error listing or searching objects: {e}", exc_info=True + ) # Log traceback return None @@ -178,34 +208,34 @@ def get_base_pattern_from_unmixed(unmixed_key: str) -> str: """Extract the round pattern (e.g., R3) from unmixed_spots_R3_minDist_3.pkl to find mixed_spots_R3.pkl""" filename = Path(unmixed_key).name # Extract pattern like R3 from unmixed_spots_R3_minDist_3.pkl - parts = filename.split('_') + parts = filename.split("_") for part in parts: - if part.startswith('R') and part[1:].isdigit(): + if part.startswith("R") and part[1:].isdigit(): return part - return 'R3' # Default fallback + return "R3" # Default fallback def load_and_merge_spots_from_s3( - bucket: str, - dataset_name: str, - unmixed_spots_prefix: str, - valid_spots_only: bool = True + bucket: str, + dataset_name: str, + unmixed_spots_prefix: str, + valid_spots_only: bool = True, ) -> Optional[pl.DataFrame]: """ Load both mixed and unmixed spots files, merge them, cache as parquet, and return merged DataFrame. - + Args: bucket: S3 bucket name dataset_name: Dataset name (used for parquet filename) unmixed_spots_prefix: S3 prefix where spots files are located valid_spots_only: If True, filter to only valid spots. If False, return all spots. - + Returns: Merged Polars DataFrame or None if loading failed """ cache_dir = Path("/s3-cache") / bucket / dataset_name parquet_file = cache_dir / f"{dataset_name}.parquet" - + # Check if merged parquet file already exists if parquet_file.exists(): logger.info(f"Loading merged data from cached parquet: {parquet_file}") @@ -214,40 +244,54 @@ def load_and_merge_spots_from_s3( # Optimize data types and filter for valid spots df_optimized = optimize_dtypes(df) if valid_spots_only: - df_final = df_optimized.filter(pl.col('valid_spot')) - logger.info(f"Loaded DataFrame from parquet (valid spots only). Shape: {df_final.shape}") + df_final = df_optimized.filter(pl.col("valid_spot")) + logger.info( + f"Loaded DataFrame from parquet (valid spots only). Shape: {df_final.shape}" + ) else: df_final = df_optimized - logger.info(f"Loaded DataFrame from parquet (all spots). Shape: {df_final.shape}") + logger.info( + f"Loaded DataFrame from parquet (all spots). Shape: {df_final.shape}" + ) return df_final except Exception as e: logger.error(f"Error loading parquet file: {e}", exc_info=True) # Fall through to regenerate the file - + # Need to download, merge, and cache - logger.info(f"Parquet file not found or corrupted. Downloading and merging spots files...") - + logger.info( + f"Parquet file not found or corrupted. Downloading and merging spots files..." + ) + # 1. Find unmixed spots file - unmixed_key = find_unmixed_spots_file(bucket, unmixed_spots_prefix, "unmixed_spots_*.pkl") + unmixed_key = find_unmixed_spots_file( + bucket, unmixed_spots_prefix, "unmixed_spots_*.pkl" + ) if not unmixed_key: - logger.error(f"Could not find unmixed spots file in {unmixed_spots_prefix}") + logger.error( + f"Could not find unmixed spots file in {unmixed_spots_prefix}" + ) return None - + # 2. Find mixed spots file based on pattern from unmixed file base_pattern = get_base_pattern_from_unmixed(unmixed_key) mixed_pattern = f"mixed_spots_{base_pattern}.pkl" - mixed_key = find_mixed_spots_file(bucket, unmixed_spots_prefix, mixed_pattern) + mixed_key = find_mixed_spots_file( + bucket, unmixed_spots_prefix, mixed_pattern + ) if not mixed_key: - logger.error(f"Could not find mixed spots file matching pattern {mixed_pattern} in {unmixed_spots_prefix}") + logger.error( + f"Could not find mixed spots file matching pattern {mixed_pattern} in {unmixed_spots_prefix}" + ) return None - + logger.info(f"Found unmixed file: {unmixed_key}") logger.info(f"Found mixed file: {mixed_key}") - + # 3. Download both files to /tmp with tempfile.TemporaryDirectory() as tmp_dir: tmp_dir_path = Path(tmp_dir) - + # Download unmixed file unmixed_tmp_path = tmp_dir_path / f"unmixed_{os.getpid()}.pkl" logger.info(f"Downloading unmixed file to {unmixed_tmp_path}") @@ -255,12 +299,12 @@ def load_and_merge_spots_from_s3( key=unmixed_key, bucket_name=bucket, local_path=str(unmixed_tmp_path), - use_cache=False + use_cache=False, ) if not unmixed_local: logger.error("Failed to download unmixed spots file") return None - + # Download mixed file mixed_tmp_path = tmp_dir_path / f"mixed_{os.getpid()}.pkl" logger.info(f"Downloading mixed file to {mixed_tmp_path}") @@ -268,19 +312,19 @@ def load_and_merge_spots_from_s3( key=mixed_key, bucket_name=bucket, local_path=str(mixed_tmp_path), - use_cache=False + use_cache=False, ) if not mixed_local: logger.error("Failed to download mixed spots file") return None - + # 4. Load both DataFrames using Polars (via pandas for pickle support) try: logger.info("Loading unmixed spots DataFrame...") df_unmixed_pd = pd.read_pickle(unmixed_local) df_unmixed = pl.from_pandas(df_unmixed_pd) logger.info(f"Loaded unmixed DataFrame. Shape: {df_unmixed.shape}") - + logger.info("Loading mixed spots DataFrame...") df_mixed_pd = pd.read_pickle(mixed_local) df_mixed = pl.from_pandas(df_mixed_pd) @@ -288,7 +332,7 @@ def load_and_merge_spots_from_s3( except Exception as e: logger.error(f"Error loading pickle files: {e}", exc_info=True) return None - + # 5. Merge the DataFrames try: logger.info("Merging DataFrames...") @@ -302,30 +346,40 @@ def load_and_merge_spots_from_s3( try: # Ensure cache directory exists cache_dir.mkdir(parents=True, exist_ok=True) - + logger.info(f"Saving merged DataFrame to parquet: {parquet_file}") - df_merged.write_parquet(parquet_file, compression='snappy') + df_merged.write_parquet(parquet_file, compression="snappy") logger.info(f"Successfully saved merged data to {parquet_file}") except Exception as e: logger.error(f"Error saving parquet file: {e}", exc_info=True) # Continue anyway - we have the data in memory - + # 8. Filter for valid spots (if requested) and return if valid_spots_only: - df_final = df_merged.filter(pl.col('valid_spot')) - logger.info(f"Returning valid spots DataFrame. Shape: {df_final.shape}") + df_final = df_merged.filter(pl.col("valid_spot")) + logger.info( + f"Returning valid spots DataFrame. Shape: {df_final.shape}" + ) else: df_final = df_merged - logger.info(f"Returning all spots DataFrame. Shape: {df_final.shape}") + logger.info( + f"Returning all spots DataFrame. Shape: {df_final.shape}" + ) return df_final -def find_unmixed_spots_file(bucket: str, prefix: str, pattern: str) -> Optional[str]: +def find_unmixed_spots_file( + bucket: str, prefix: str, pattern: str +) -> Optional[str]: """Finds the first S3 object key matching the pattern within the prefix.""" - logger.info(f"Searching for pattern '{pattern}' in bucket '{bucket}' with prefix '{prefix}'...") + logger.info( + f"Searching for pattern '{pattern}' in bucket '{bucket}' with prefix '{prefix}'..." + ) try: # List objects - consider increasing max_keys if many files share the prefix - objects = s3_handler.list_objects(bucket_name=bucket, prefix=prefix, max_keys=200) + objects = s3_handler.list_objects( + bucket_name=bucket, prefix=prefix, max_keys=200 + ) if not objects: logger.warning(f"No objects found with prefix '{prefix}'.") return None @@ -339,23 +393,32 @@ def find_unmixed_spots_file(bucket: str, prefix: str, pattern: str) -> Optional[ found_files.append(key) if not found_files: - logger.warning(f"No files matching pattern '{pattern}' found within the first {len(objects)} objects listed under prefix '{prefix}'.") + logger.warning( + f"No files matching pattern '{pattern}' found within the first {len(objects)} objects listed under prefix '{prefix}'." + ) # Consider adding logic here to list more objects if needed (pagination) return None if len(found_files) > 1: - logger.warning(f"Multiple files ({len(found_files)}) matching pattern found. Using the first one: {found_files[0]}") + logger.warning( + f"Multiple files ({len(found_files)}) matching pattern found. Using the first one: {found_files[0]}" + ) - return found_files[0] # Return the full key of the first match + return found_files[0] # Return the full key of the first match except Exception as e: - logger.error(f"Error listing or searching objects: {e}", exc_info=True) # Log traceback + logger.error( + f"Error listing or searching objects: {e}", exc_info=True + ) # Log traceback return None -def find_related_files(bucket: str, prefix: str, spots_file: str) -> Dict[str, str]: + +def find_related_files( + bucket: str, prefix: str, spots_file: str +) -> Dict[str, str]: """ Find related ratios.txt and summary_stats.csv files based on the unmixed spots file pattern. - + Parameters: ----------- bucket: str @@ -364,55 +427,58 @@ def find_related_files(bucket: str, prefix: str, spots_file: str) -> Dict[str, s S3 prefix (folder path) spots_file: str Full key of the spots file that was found - + Returns: -------- Dict[str, str] Dictionary with keys 'ratios' and 'summary_stats' pointing to file keys if found """ - result = {'ratios': None, 'summary_stats': None} - + result = {"ratios": None, "summary_stats": None} + try: # Extract base filename without extension spots_filename = Path(spots_file).stem - base_pattern = spots_filename.replace('unmixed_spots', '*') - + base_pattern = spots_filename.replace("unmixed_spots", "*") + # List objects in the same directory - objects = s3_handler.list_objects(bucket_name=bucket, prefix=prefix, max_keys=200) + objects = s3_handler.list_objects( + bucket_name=bucket, prefix=prefix, max_keys=200 + ) print(objects) - + # Look for ratios.txt for key in objects: filename = Path(key).name - if '_ratios.txt' in filename: + if "_ratios.txt" in filename: logger.info(f"Found ratios file: {key}") - result['ratios'] = key + result["ratios"] = key break - + # Look for summary_stats.csv for key in objects: filename = Path(key).name - if 'summary_stats.csv' in filename: + if "summary_stats.csv" in filename: logger.info(f"Found summary stats file: {key}") - result['summary_stats'] = key + result["summary_stats"] = key break - + except Exception as e: logger.error(f"Error finding related files: {e}", exc_info=True) - + return result + def load_ratios_from_s3(bucket: str, key: str) -> Optional[np.ndarray]: """ Load a ratios.txt file from S3. - + Parameters: ----------- bucket: str S3 bucket name key: str S3 key for the ratios file - + Returns: -------- Optional[np.ndarray] @@ -421,43 +487,46 @@ def load_ratios_from_s3(bucket: str, key: str) -> Optional[np.ndarray]: if not key: logger.warning("No ratios file key provided") return None - + logger.info(f"Loading ratios from s3://{bucket}/{key}") - + try: # Download the file content content = s3_handler.get_object(key=key, bucket_name=bucket) if content is None: logger.error(f"Failed to get object content for {key}") return None - + # Parse the content as a matrix of numbers - content_str = content.decode('utf-8') - rows = content_str.strip().split('\n') + content_str = content.decode("utf-8") + rows = content_str.strip().split("\n") ratios_matrix = [] - + for row in rows: # Split by tabs and convert to integers values = [int(val) for val in row.strip().split()] ratios_matrix.append(values) - + return np.array(ratios_matrix) - + except Exception as e: logger.error(f"Error loading ratios file: {e}", exc_info=True) return None -def load_summary_stats_from_s3(bucket: str, key: str) -> Optional[pd.DataFrame]: + +def load_summary_stats_from_s3( + bucket: str, key: str +) -> Optional[pd.DataFrame]: """ Load a summary_stats.csv file from S3. - + Parameters: ----------- bucket: str S3 bucket name key: str S3 key for the summary stats file - + Returns: -------- Optional[pd.DataFrame] @@ -466,26 +535,26 @@ def load_summary_stats_from_s3(bucket: str, key: str) -> Optional[pd.DataFrame]: if not key: logger.warning("No summary stats file key provided") return None - + logger.info(f"Loading summary stats from s3://{bucket}/{key}") - + try: # Download the file content content = s3_handler.get_object(key=key, bucket_name=bucket) if content is None: logger.error(f"Failed to get object content for {key}") return None - + # Parse CSV df = pd.read_csv(io.BytesIO(content)) - + # Add 'removed_spots' column - if 'total_spots' in df.columns and 'kept_spots' in df.columns: - df['removed_spots'] = df['total_spots'] - df['kept_spots'] - df['unchanged_spots'] = df['kept_spots'] - df['reassigned_spots'] + if "total_spots" in df.columns and "kept_spots" in df.columns: + df["removed_spots"] = df["total_spots"] - df["kept_spots"] + df["unchanged_spots"] = df["kept_spots"] - df["reassigned_spots"] return df - + except Exception as e: logger.error(f"Error loading summary stats file: {e}", exc_info=True) return None @@ -497,28 +566,41 @@ def get_s3_object_size(bucket: str, key: str) -> Optional[int]: try: # Use the get_object_metadata method (assumes it was added to S3Handler) # Check if the method exists before calling - if not hasattr(s3_handler, 'get_object_metadata'): - logger.error("Error: S3Handler instance does not have 'get_object_metadata' method. Please add it to s3_handler.py.") - return None + if not hasattr(s3_handler, "get_object_metadata"): + logger.error( + "Error: S3Handler instance does not have 'get_object_metadata' method. Please add it to s3_handler.py." + ) + return None metadata = s3_handler.get_object_metadata(key=key, bucket_name=bucket) - if metadata and 'ContentLength' in metadata and metadata['ContentLength'] is not None: - size_bytes = metadata['ContentLength'] + if ( + metadata + and "ContentLength" in metadata + and metadata["ContentLength"] is not None + ): + size_bytes = metadata["ContentLength"] # Convert size to MB for readability size_mb = size_bytes / (1024 * 1024) logger.info(f"Object size: {size_bytes} bytes ({size_mb:.2f} MB)") return size_bytes else: - logger.warning(f"Could not retrieve valid 'ContentLength' metadata for {key}. Metadata received: {metadata}") + logger.warning( + f"Could not retrieve valid 'ContentLength' metadata for {key}. Metadata received: {metadata}" + ) return None except Exception as e: - logger.error(f"Error getting object metadata for {key}: {e}", exc_info=True) + logger.error( + f"Error getting object metadata for {key}: {e}", exc_info=True + ) return None + def load_pkl_from_s3(bucket: str, key: str) -> Optional[pd.DataFrame]: """Loads a pickle file from S3 into a pandas DataFrame, using local caching.""" - logger.info(f"Attempting to load pickle file: s3://{bucket}/{key} (using cache)") + logger.info( + f"Attempting to load pickle file: s3://{bucket}/{key} (using cache)" + ) # 1. Get object size for context (optional, still useful) get_s3_object_size(bucket, key) @@ -533,11 +615,13 @@ def load_pkl_from_s3(bucket: str, key: str) -> Optional[pd.DataFrame]: if local_file_path is None: logger.error("Failed to download or retrieve file from cache.") return None - + logger.info(f"File available locally at: {local_file_path}") except Exception as e: - logger.error(f"Error during file download/cache check: {e}", exc_info=True) + logger.error( + f"Error during file download/cache check: {e}", exc_info=True + ) return None # 3. Load the pickle data using pandas from the local path @@ -546,22 +630,32 @@ def load_pkl_from_s3(bucket: str, key: str) -> Optional[pd.DataFrame]: try: df = pd.read_pickle(local_file_path) n_all = df.shape[0] - df = df[df['valid_spot'] == True] + df = df[df["valid_spot"] == True] n_valid = df.shape[0] logger.info(f"Successfully loaded DataFrame. Shape: {df.shape}") logger.info(f"Total spots: {n_all}, Valid spots: {n_valid}") return df except pd.errors.EmptyDataError: - logger.error(f"Error loading pickle from {local_file_path}: The file seems to be empty or contains no data.") - return None + logger.error( + f"Error loading pickle from {local_file_path}: The file seems to be empty or contains no data." + ) + return None except FileNotFoundError: - logger.error(f"Error loading pickle: Local file not found at {local_file_path} (should not happen if download succeeded)." ) - return None + logger.error( + f"Error loading pickle: Local file not found at {local_file_path} (should not happen if download succeeded)." + ) + return None except Exception as e: - logger.error(f"Error loading pickle data from {local_file_path}: {e}", exc_info=True) + logger.error( + f"Error loading pickle data from {local_file_path}: {e}", + exc_info=True, + ) return None -def load_processing_manifest_from_s3(bucket: str, key: str) -> Optional[Dict[str, Any]]: + +def load_processing_manifest_from_s3( + bucket: str, key: str +) -> Optional[Dict[str, Any]]: """ Loads a processing_manifest.json file from S3. @@ -591,13 +685,17 @@ def load_processing_manifest_from_s3(bucket: str, key: str) -> Optional[Dict[str return None # Parse the JSON content - manifest_data = json.loads(content.decode('utf-8')) - logger.info(f"Successfully loaded and parsed processing manifest: {key}") + manifest_data = json.loads(content.decode("utf-8")) + logger.info( + f"Successfully loaded and parsed processing manifest: {key}" + ) return manifest_data except json.JSONDecodeError as e: logger.error(f"Error decoding JSON from {key}: {e}", exc_info=True) return None except Exception as e: - logger.error(f"Error loading processing manifest file: {e}", exc_info=True) - return None \ No newline at end of file + logger.error( + f"Error loading processing manifest file: {e}", exc_info=True + ) + return None From 8c6aab56772a2ce27a3f954e901c86217e8d7335 Mon Sep 17 00:00:00 2001 From: Carson Berry Date: Fri, 7 Nov 2025 11:32:39 -0800 Subject: [PATCH 04/20] refactor: cleanup --- src/see_spot/app.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/see_spot/app.py b/src/see_spot/app.py index 4d4ef88..942a012 100644 --- a/src/see_spot/app.py +++ b/src/see_spot/app.py @@ -52,7 +52,6 @@ "spot_channels_from_manifest": None, "sankey_data": None, # Cache Sankey data to avoid recalculation "unmixed_spots_filename": None # Store unmixed spots filename for neuroglancer logic - "unmixed_spots_filename": None # Store unmixed spots filename for neuroglancer logic } @@ -671,7 +670,7 @@ async def download_dataset(request: Request): status_code=404, content={ "error": "Spots data file not found", - "checked_path": f"s3://{S3_BUCKET}/{spots_key}unmixed_spots_*.pkl" + "checked_path": f"s3://{S3_BUCKET}/{spots_key}unmixed_spots_*.pkl"} ) From ab94721a9e63c6fc71518f682b6c5fc0f4b79185 Mon Sep 17 00:00:00 2001 From: Carson Berry Date: Fri, 7 Nov 2025 11:45:05 -0800 Subject: [PATCH 05/20] feat: add support for -1 round in base_pattern --- src/see_spot/s3_utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/see_spot/s3_utils.py b/src/see_spot/s3_utils.py index 10b2ae5..feca955 100644 --- a/src/see_spot/s3_utils.py +++ b/src/see_spot/s3_utils.py @@ -210,7 +210,8 @@ def get_base_pattern_from_unmixed(unmixed_key: str) -> str: # Extract pattern like R3 from unmixed_spots_R3_minDist_3.pkl parts = filename.split("_") for part in parts: - if part.startswith("R") and part[1:].isdigit(): + # add support for R-1 (default round for datasets without metadata) + if part.startswith("R") and (part[1:].isdigit() or part[1:] == '-1'): return part return "R3" # Default fallback From 390d67fd37ce950b3f1fa45517efc4f50b12e30e Mon Sep 17 00:00:00 2001 From: Carson Berry Date: Fri, 7 Nov 2025 11:52:41 -0800 Subject: [PATCH 06/20] bugfix: change mixed pattern --- src/see_spot/s3_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/see_spot/s3_utils.py b/src/see_spot/s3_utils.py index feca955..85747f4 100644 --- a/src/see_spot/s3_utils.py +++ b/src/see_spot/s3_utils.py @@ -276,7 +276,7 @@ def load_and_merge_spots_from_s3( # 2. Find mixed spots file based on pattern from unmixed file base_pattern = get_base_pattern_from_unmixed(unmixed_key) - mixed_pattern = f"mixed_spots_{base_pattern}.pkl" + mixed_pattern = f"mixed_spots_{base_pattern}*.pkl" mixed_key = find_mixed_spots_file( bucket, unmixed_spots_prefix, mixed_pattern ) From adfa208a87d9ed42b1c1bcbdb557c955c17640b5 Mon Sep 17 00:00:00 2001 From: Carson Berry Date: Fri, 7 Nov 2025 11:57:46 -0800 Subject: [PATCH 07/20] feat: only search image_spot_spectral_unmixing top level --- src/see_spot/s3_utils.py | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/src/see_spot/s3_utils.py b/src/see_spot/s3_utils.py index 85747f4..19d9df4 100644 --- a/src/see_spot/s3_utils.py +++ b/src/see_spot/s3_utils.py @@ -163,7 +163,7 @@ def merge_spots_tables(spots_mixed, spots_unmixed): def find_mixed_spots_file( bucket: str, prefix: str, pattern: str ) -> Optional[str]: - """Finds the first mixed spots file matching the pattern within the prefix.""" + """Finds the first mixed spots file matching the pattern within the prefix (top level only, non-recursive).""" logger.info( f"Searching for mixed spots pattern '{pattern}' in bucket '{bucket}' with prefix '{prefix}'..." ) @@ -178,6 +178,13 @@ def find_mixed_spots_file( found_files = [] for key in objects: + # Skip objects in subdirectories - only check files at the top level + # Remove the prefix and check if there are any additional slashes + relative_path = key[len(prefix):] if key.startswith(prefix) else key + # If there's a slash in the relative path, it's in a subdirectory + if '/' in relative_path.lstrip('/'): + continue + # Use Pathlib to easily get the filename part of the key filename = Path(key).name if fnmatch.fnmatch(filename, pattern): @@ -372,7 +379,7 @@ def load_and_merge_spots_from_s3( def find_unmixed_spots_file( bucket: str, prefix: str, pattern: str ) -> Optional[str]: - """Finds the first S3 object key matching the pattern within the prefix.""" + """Finds the first S3 object key matching the pattern within the prefix (top level only, non-recursive).""" logger.info( f"Searching for pattern '{pattern}' in bucket '{bucket}' with prefix '{prefix}'..." ) @@ -387,6 +394,13 @@ def find_unmixed_spots_file( found_files = [] for key in objects: + # Skip objects in subdirectories - only check files at the top level + # Remove the prefix and check if there are any additional slashes + relative_path = key[len(prefix):] if key.startswith(prefix) else key + # If there's a slash in the relative path, it's in a subdirectory + if '/' in relative_path.lstrip('/'): + continue + # Use Pathlib to easily get the filename part of the key filename = Path(key).name if fnmatch.fnmatch(filename, pattern): From 44ef2d630b16c87408476b1ab5825e7eff3b4e1d Mon Sep 17 00:00:00 2001 From: Carson Berry Date: Fri, 7 Nov 2025 12:22:12 -0800 Subject: [PATCH 08/20] fix: logic in create_neuroglancer_link() --- src/see_spot/app.py | 3 +-- src/see_spot/ng_utils.py | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/src/see_spot/app.py b/src/see_spot/app.py index 942a012..bdeb879 100644 --- a/src/see_spot/app.py +++ b/src/see_spot/app.py @@ -528,8 +528,7 @@ async def create_neuroglancer_link(request: Request): spot_id = data.get("spot_id") position = data.get("position") point_annotation = data.get("point_annotation") - if not position or not point_annotation or not spot_id: - annotation_color = data.get("annotation_color", "#FFFF00") + annotation_color = data.get("annotation_color", "#FFFF00") # Input validation if not position or not point_annotation or not spot_id: diff --git a/src/see_spot/ng_utils.py b/src/see_spot/ng_utils.py index 49e6498..a17c15f 100644 --- a/src/see_spot/ng_utils.py +++ b/src/see_spot/ng_utils.py @@ -255,7 +255,7 @@ def create_link_from_json( position, spot_id, point_annotation, - annotation_color="#FFFF00", + annotation_color="#FF0000", spacing=3.0, cross_section_scale=None, base_url="https://neuroglancer-demo.appspot.com", From 30f8a596524b1fd6e0ac13b7f81f5d020c97e4ba Mon Sep 17 00:00:00 2001 From: Carson Berry Date: Fri, 7 Nov 2025 12:36:07 -0800 Subject: [PATCH 09/20] add some logging to ng function in app.py --- src/see_spot/app.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/see_spot/app.py b/src/see_spot/app.py index bdeb879..4050100 100644 --- a/src/see_spot/app.py +++ b/src/see_spot/app.py @@ -537,8 +537,10 @@ async def create_neuroglancer_link(request: Request): content={"error": "Missing required parameters: position, point_annotation, or spot_id"} ) # Check if we should use the JSON-based method (when "merged" is in the pkl filename) - unmixed_spots_filename = df_cache.get("unmixed_spots_filename") or "" + unmixed_spots_filename = df_cache.get("unmixed_spots_filename", "no filename found") + logger.info(f"Unmixed spots filename: {unmixed_spots_filename}") use_json_method = "merged" in unmixed_spots_filename.lower() + logger.info(f"Using JSON-based method: {use_json_method}") try: if use_json_method: # Use the JSON-based method for merged datasets From 272182ee0176b4f76e9561c2e1659f065473eb6f Mon Sep 17 00:00:00 2001 From: Carson Berry Date: Fri, 7 Nov 2025 14:55:11 -0800 Subject: [PATCH 10/20] try except for ng mode creation instead of using filename --- src/see_spot/app.py | 53 +++++++++++++++++++++++++++------------------ 1 file changed, 32 insertions(+), 21 deletions(-) diff --git a/src/see_spot/app.py b/src/see_spot/app.py index 4050100..58f839f 100644 --- a/src/see_spot/app.py +++ b/src/see_spot/app.py @@ -521,8 +521,18 @@ async def get_real_spots_data( async def create_neuroglancer_link(request: Request): """Creates a neuroglancer link with a point annotation at specified coordinates.""" # Parse the JSON data from the request + """const requestData = { + fused_s3_paths: fusedS3Paths, + position: [details.x, details.y, details.z, 0], + point_annotation: [details.x, details.y, details.z, 0.5, 0], + cell_id: details.cell_id || 42, + spot_id: spotId, + annotation_color: "#FFFF00", + cross_section_scale: 0.2 + };""" data = await request.json() + # Extract the parameters from the request cross_section_scale = data.get("cross_section_scale", "0.135") spot_id = data.get("spot_id") @@ -539,28 +549,29 @@ async def create_neuroglancer_link(request: Request): # Check if we should use the JSON-based method (when "merged" is in the pkl filename) unmixed_spots_filename = df_cache.get("unmixed_spots_filename", "no filename found") logger.info(f"Unmixed spots filename: {unmixed_spots_filename}") - use_json_method = "merged" in unmixed_spots_filename.lower() - logger.info(f"Using JSON-based method: {use_json_method}") + # use_json_method = "merged" in unmixed_spots_filename.lower() + # logger.info(f"Using JSON-based method: {use_json_method}") try: - if use_json_method: - # Use the JSON-based method for merged datasets - logger.info(f"Using create_link_from_json method for merged dataset (filename: {unmixed_spots_filename})") - - # Construct the neuroglancer JSON path - ng_json_path = f"s3://{S3_BUCKET}/{DATA_PREFIX}/phase_correlation_stitching_neuroglancer.json" - logger.info(f"Neuroglancer JSON path: {ng_json_path}") - - # Create the neuroglancer link from JSON - ng_link = ng_utils.create_link_from_json( - ng_json_path=ng_json_path, - position=position, - spot_id=spot_id, - point_annotation=point_annotation, - annotation_color=annotation_color, - spacing=3.0, - cross_section_scale=cross_section_scale - ) - else: + # if use_json_method: + try: + # Use the JSON-based method for merged datasets + logger.info(f"Using create_link_from_json method for merged dataset (filename: {unmixed_spots_filename})") + + # Construct the neuroglancer JSON path + ng_json_path = f"s3://{S3_BUCKET}/{DATA_PREFIX}/phase_correlation_stitching_neuroglancer.json" + logger.info(f"Neuroglancer JSON path: {ng_json_path}") + + # Create the neuroglancer link from JSON + ng_link = ng_utils.create_link_from_json( + ng_json_path=ng_json_path, + position=position, + spot_id=spot_id, + point_annotation=point_annotation, + annotation_color=annotation_color, + spacing=3.0, + cross_section_scale=cross_section_scale + ) + except Exception as e: # Use the traditional method for non-merged datasets logger.info(f"Using create_link_no_upload method for non-merged dataset (filename: {unmixed_spots_filename})") fused_s3_paths = data.get("fused_s3_paths") From 54b0ed7fe04edf8fc2fcf9614123a86bbae0eee8 Mon Sep 17 00:00:00 2001 From: Matt Davis Date: Fri, 7 Nov 2025 16:43:40 -0800 Subject: [PATCH 11/20] feat: fixes for single channel --- src/see_spot/app.py | 195 +++++++++++++++++++++++---------- src/see_spot/logging_config.py | 57 ++++++++++ src/see_spot/ng_utils.py | 41 +++---- src/sessions.db | Bin 0 -> 16384 bytes tests/test_dataset_loading.py | 76 +++++++++++++ 5 files changed, 288 insertions(+), 81 deletions(-) create mode 100644 src/see_spot/logging_config.py create mode 100644 src/sessions.db create mode 100644 tests/test_dataset_loading.py diff --git a/src/see_spot/app.py b/src/see_spot/app.py index 58f839f..201289c 100644 --- a/src/see_spot/app.py +++ b/src/see_spot/app.py @@ -2,14 +2,11 @@ from fastapi.responses import JSONResponse from fastapi.staticfiles import StaticFiles from fastapi.templating import Jinja2Templates -import numpy as np -import pandas as pd -from datetime import datetime, timedelta +from datetime import datetime from see_spot import ng_utils import uvicorn import logging import os -import json from pathlib import Path import polars as pl import itertools @@ -17,6 +14,7 @@ # Import your modules from see_spot.s3_handler import s3_handler +from see_spot.logging_config import setup_logging from see_spot.s3_utils import ( find_unmixed_spots_file, find_related_files, load_ratios_from_s3, load_summary_stats_from_s3, @@ -24,7 +22,8 @@ find_processing_manifest ) -logging.basicConfig(level=logging.INFO) +# Initialize logging using central utility (idempotent) +setup_logging(os.getenv("SEE_SPOT_LOG_LEVEL", "INFO")) logger = logging.getLogger(__name__) app = FastAPI() @@ -61,6 +60,7 @@ def get_channel_pairs(df: pl.DataFrame) -> List[Tuple[str, str]]: channels = sorted([col.split('_')[1] for col in intensity_cols]) return list(itertools.combinations(channels, 2)) + def calculate_sankey_data_from_polars(df_polars: pl.DataFrame) -> Dict[str, Any]: """ Calculate Sankey diagram data directly from Polars DataFrame for maximum performance. @@ -421,17 +421,35 @@ async def get_real_spots_data( detail_cols = ['spot_id', 'cell_id', 'round', 'z', 'y', 'x'] available_detail_cols = [col for col in detail_cols if col in plot_df.columns] - if len(available_detail_cols) > 1: - logger.info(f"Creating spot_details with columns: {available_detail_cols}") + # Normalize spot_id to int if float to avoid '63.0' style keys + if 'spot_id' in plot_df.columns and plot_df['spot_id'].dtype.kind == 'f': + if plot_df['spot_id'].isna().any(): + logger.warning("spot_id column has NaNs before coercion; keys may be inconsistent") + logger.info("Coercing float spot_id column to int64 for clean spot_details keys") + try: + plot_df['spot_id'] = plot_df['spot_id'].astype('int64') + except Exception as e: + logger.error(f"Failed coercing spot_id to int64: {e}") + + if len(available_detail_cols) >= 1: + logger.info(f"Building spot_details from columns: {available_detail_cols}") spot_details_df = plot_df[available_detail_cols].copy() - - spot_details = { - str(row['spot_id']): { - col: row[col] for col in available_detail_cols if col != 'spot_id' + # Ensure spot_id is present + if 'spot_id' not in spot_details_df.columns: + logger.warning("spot_id column missing; cannot build spot_details") + spot_details = {} + else: + spot_details = { + str(int(row['spot_id'])): { + col: row[col] for col in available_detail_cols if col != 'spot_id' + } + for _, row in spot_details_df.iterrows() } - for _, row in spot_details_df.iterrows() - } - logger.info(f"Created spot_details dictionary with {len(spot_details)} entries") + logger.info( + "spot_details built: %d entries | sample keys: %s", + len(spot_details), + list(spot_details.keys())[:5] + ) else: spot_details = {} logger.warning("Could not create spot_details: required columns not found in DataFrame") @@ -531,8 +549,6 @@ async def create_neuroglancer_link(request: Request): cross_section_scale: 0.2 };""" data = await request.json() - - # Extract the parameters from the request cross_section_scale = data.get("cross_section_scale", "0.135") spot_id = data.get("spot_id") @@ -546,44 +562,100 @@ async def create_neuroglancer_link(request: Request): status_code=400, content={"error": "Missing required parameters: position, point_annotation, or spot_id"} ) - # Check if we should use the JSON-based method (when "merged" is in the pkl filename) - unmixed_spots_filename = df_cache.get("unmixed_spots_filename", "no filename found") - logger.info(f"Unmixed spots filename: {unmixed_spots_filename}") - # use_json_method = "merged" in unmixed_spots_filename.lower() - # logger.info(f"Using JSON-based method: {use_json_method}") - try: - # if use_json_method: - try: - # Use the JSON-based method for merged datasets - logger.info(f"Using create_link_from_json method for merged dataset (filename: {unmixed_spots_filename})") - - # Construct the neuroglancer JSON path - ng_json_path = f"s3://{S3_BUCKET}/{DATA_PREFIX}/phase_correlation_stitching_neuroglancer.json" - logger.info(f"Neuroglancer JSON path: {ng_json_path}") - - # Create the neuroglancer link from JSON - ng_link = ng_utils.create_link_from_json( - ng_json_path=ng_json_path, - position=position, - spot_id=spot_id, - point_annotation=point_annotation, - annotation_color=annotation_color, - spacing=3.0, - cross_section_scale=cross_section_scale + # Enhanced logging & dynamic JSON path logic + unmixed_spots_filename = df_cache.get("unmixed_spots_filename", "") + logger.info( + "Neuroglancer link request | spot_id=%s | unmixed_spots_file=%s | json_override_env=%s", + spot_id, + unmixed_spots_filename, + os.getenv("SEE_SPOT_NG_JSON_NAME", "") + ) + logger.debug( + "Raw request data keys: %s", ",".join(sorted(data.keys())) + ) + logger.debug( + "Position=%s point_annotation=%s annotation_color=%s cross_section_scale=%s", + position, + point_annotation, + annotation_color, + cross_section_scale, + ) + + # Determine JSON file name (env override allowed) and full S3 path + ng_json_filename = os.getenv( + "SEE_SPOT_NG_JSON_NAME", "phase_correlation_stitching_neuroglancer.json" + ) + ng_json_path = f"s3://{S3_BUCKET}/{DATA_PREFIX}/{ng_json_filename}" + s3_key_for_json = f"{DATA_PREFIX}/{ng_json_filename}" # key relative to bucket + logger.info("Constructed Neuroglancer JSON path: %s", ng_json_path) + + # Check existence of JSON on S3 (metadata only) for better diagnostics + json_metadata = None + try: + json_metadata = s3_handler.get_object_metadata( + s3_key_for_json, bucket_name=S3_BUCKET + ) + if json_metadata: + logger.info( + "Neuroglancer JSON found on S3 (ContentLength=%s LastModified=%s)", + json_metadata.get("ContentLength"), + json_metadata.get("LastModified"), ) - except Exception as e: - # Use the traditional method for non-merged datasets - logger.info(f"Using create_link_no_upload method for non-merged dataset (filename: {unmixed_spots_filename})") - fused_s3_paths = data.get("fused_s3_paths") - cell_id = data.get("cell_id", 42) # Default value if not provided + else: + logger.warning( + "Neuroglancer JSON NOT found on S3: s3://%s/%s", S3_BUCKET, s3_key_for_json + ) + except Exception as meta_err: + logger.error( + "Error checking Neuroglancer JSON metadata: %s", meta_err, exc_info=True + ) + + # Decide strategy: prefer JSON-based method when file exists; fall back otherwise + use_json_method = json_metadata is not None or "merged" in unmixed_spots_filename.lower() + logger.info("Use JSON method decision: %s", use_json_method) + + try: + if use_json_method: + logger.info( + "Attempting create_link_from_json(spot_id=%s, path=%s)", + spot_id, + ng_json_path, + ) + try: + ng_link = ng_utils.create_link_from_json( + ng_json_path=ng_json_path, + position=position, + spot_id=spot_id, + point_annotation=point_annotation, + annotation_color=annotation_color, + spacing=3.0, + cross_section_scale=cross_section_scale, + ) + logger.info("Successfully built Neuroglancer link from JSON") + except Exception as json_err: + logger.error( + "JSON link creation failed: %s | Falling back to direct method", + json_err, + exc_info=True, + ) + use_json_method = False # force fallback + if not use_json_method: + logger.info("Falling back to create_link_no_upload() pathway") + fused_s3_paths = data.get("fused_s3_paths") + cell_id = data.get("cell_id", 42) if not fused_s3_paths: + logger.error( + "Missing fused_s3_paths for fallback method; cannot proceed" + ) return JSONResponse( status_code=400, - content={"error": "Missing required parameter: fused_s3_paths (required for non-merged datasets)"} + content={ + "error": "Missing required parameter: fused_s3_paths for fallback Neuroglancer generation", + "attempted_json_path": ng_json_path, + "json_exists": json_metadata is not None, + }, ) - - # Create the neuroglancer link ng_link = ng_utils.create_link_no_upload( fused_s3_paths, annotation_color=annotation_color, @@ -591,15 +663,22 @@ async def create_neuroglancer_link(request: Request): cell_id=cell_id, spot_id=spot_id, position=position, - point_annotation=point_annotation + point_annotation=point_annotation, ) - - return {"url": ng_link} + logger.info( + "Successfully built Neuroglancer link via fallback direct method" + ) + + return {"url": ng_link, "used_json_method": use_json_method} except Exception as e: - logger.error(f"Error creating neuroglancer link: {str(e)}") + logger.error("Error creating neuroglancer link: %s", e, exc_info=True) return JSONResponse( status_code=500, - content={"error": f"Failed to create neuroglancer link: {str(e)}"} + content={ + "error": f"Failed to create neuroglancer link: {str(e)}", + "attempted_json_path": ng_json_path, + "json_exists": json_metadata is not None, + }, ) @app.get("/api/datasets") @@ -636,6 +715,8 @@ async def list_datasets(): logger.error(f"Error listing datasets: {e}", exc_info=True) return JSONResponse(status_code=500, content={"error": str(e)}) + + @app.post("/api/datasets/download") async def download_dataset(request: Request): """Download a dataset from S3 to local cache.""" @@ -651,9 +732,9 @@ async def download_dataset(request: Request): if not manifest_key: return JSONResponse( - status_code=404, + status_code=404, content={ - "error": f"Dataset not found on S3 - processing_manifest.json not found", + "error": "Dataset not found on S3 - processing_manifest.json not found", "checked_paths": [ f"s3://{S3_BUCKET}/{dataset_name}/processing_manifest.json", f"s3://{S3_BUCKET}/{dataset_name}/derived/processing_manifest.json" @@ -679,7 +760,7 @@ async def download_dataset(request: Request): if not spots_file: return JSONResponse( - status_code=404, + status_code=404, content={ "error": "Spots data file not found", "checked_path": f"s3://{S3_BUCKET}/{spots_key}unmixed_spots_*.pkl"} @@ -735,6 +816,8 @@ async def download_dataset(request: Request): logger.error(f"Error downloading dataset: {e}", exc_info=True) return JSONResponse(status_code=500, content={"error": str(e)}) + + @app.post("/api/datasets/set-active") async def set_active_dataset(request: Request): """Set the active dataset for the application.""" diff --git a/src/see_spot/logging_config.py b/src/see_spot/logging_config.py new file mode 100644 index 0000000..852fc57 --- /dev/null +++ b/src/see_spot/logging_config.py @@ -0,0 +1,57 @@ +import sys +import logging +from logging.config import dictConfig + + +def setup_logging(level: str = "DEBUG") -> None: + """Configure application and uvicorn logging in a single, idempotent place. + + Call this exactly once near process start (before creating the FastAPI app). + Safe to call multiple times; subsequent calls are no-ops. + """ + if getattr(setup_logging, "_configured", False): # idempotent guard + return + + dictConfig({ + "version": 1, + "disable_existing_loggers": False, + "formatters": { + "default": { + "format": "%(asctime)s | %(levelname)-8s | %(name)s | %(message)s", + }, + "access": { + "format": ( + "%(asctime)s | %(levelname)-8s | uvicorn.access | " + "%(client_addr)s - %(request_line)s -> %(status_code)s" + ), + }, + }, + "handlers": { + "console": { + "class": "logging.StreamHandler", + "stream": sys.stdout, + "formatter": "default", + "level": level, + }, + "access_console": { + "class": "logging.StreamHandler", + "stream": sys.stdout, + "formatter": "access", + "level": level, + }, + }, + "loggers": { + # Project package + "see_spot": {"handlers": ["console"], "level": level, "propagate": False}, + # Uvicorn internals + "uvicorn": {"handlers": ["console"], "level": level, "propagate": True}, + "uvicorn.error": {"handlers": ["console"], "level": level, "propagate": False}, + "uvicorn.access": {"handlers": ["access_console"], "level": level, "propagate": False}, + # FastAPI / Starlette + "fastapi": {"handlers": ["console"], "level": level, "propagate": True}, + }, + "root": {"handlers": ["console"], "level": level}, + }) + + setup_logging._configured = True + logging.getLogger("see_spot.logging_config").debug("Logging configured (level=%s)", level) diff --git a/src/see_spot/ng_utils.py b/src/see_spot/ng_utils.py index a17c15f..d48e72a 100644 --- a/src/see_spot/ng_utils.py +++ b/src/see_spot/ng_utils.py @@ -281,44 +281,33 @@ def create_link_from_json( import json from pathlib import Path - # Convert to Path object for easier handling - json_path = ( - Path(ng_json_path) - if not isinstance(ng_json_path, Path) - else ng_json_path - ) + # Robust handling of S3 vs local paths: avoid Path() on s3:// to prevent scheme collapse + is_s3 = isinstance(ng_json_path, str) and ng_json_path.startswith("s3://") + json_path_str = ng_json_path if is_s3 else str(Path(ng_json_path)) - # Load the JSON file try: - if str(json_path).startswith("s3://"): - # Handle S3 paths - import boto3 - - s3_path = str(json_path)[5:] # Remove 's3://' + if is_s3: + s3_path = json_path_str[5:] # strip 's3://' parts = s3_path.split("/") bucket = parts[0] key = "/".join(parts[1:]) - + print(f"[ng_utils] Fetching Neuroglancer JSON from S3: bucket={bucket} key={key}") s3_client = boto3.client("s3") response = s3_client.get_object(Bucket=bucket, Key=key) json_content = response["Body"].read().decode("utf-8") state_dict = json.loads(json_content) print(f"Loaded Neuroglancer state from S3: s3://{bucket}/{key}") else: - # Handle local file paths - with open(json_path, "r") as f: + print(f"[ng_utils] Loading Neuroglancer JSON from local path: {json_path_str}") + with open(json_path_str, "r") as f: state_dict = json.load(f) - print(f"Loaded Neuroglancer state from local file: {json_path}") + print(f"Loaded Neuroglancer state from local file: {json_path_str}") except FileNotFoundError: - raise FileNotFoundError( - f"Neuroglancer JSON file not found: {json_path}" - ) + raise FileNotFoundError(f"Neuroglancer JSON file not found: {json_path_str}") except json.JSONDecodeError as e: - raise ValueError(f"Invalid JSON in file {json_path}: {e}") + raise ValueError(f"Invalid JSON in file {json_path_str}: {e}") except Exception as e: - raise Exception( - f"Error loading Neuroglancer JSON from {json_path}: {e}" - ) + raise Exception(f"Error loading Neuroglancer JSON from {json_path_str}: {e}") # Update position state_dict["position"] = position @@ -396,7 +385,6 @@ def read_zarr_resolution_boto(s3_path): Returns: list: Resolution in z,y,x order in micrometers """ - import boto3 import json # Parse the S3 path @@ -442,7 +430,10 @@ def read_zarr_resolution_boto(s3_path): for idx in [z_idx, y_idx, x_idx] ): print( - f"Found resolution from multiscales: {[scale[z_idx], scale[y_idx], scale[x_idx]]}" + ( + f"Found resolution from multiscales: " + f"{[scale[z_idx], scale[y_idx], scale[x_idx]]}" + ) ) return [ scale[z_idx], diff --git a/src/sessions.db b/src/sessions.db new file mode 100644 index 0000000000000000000000000000000000000000..20147ffdb0dcb7ac6e4f7714f64c169ebd825103 GIT binary patch literal 16384 zcmeI2&u`pB6vyp&vzwIeI;E=i6xBCUiJ|k;WBbH5EFvNa?ZzuL5@2TRv9wZ}K-QId zLW&Ul3H$+wJAXtD9N@-<3zr_aazTP4^<0? zR3<2ojvuKiV9uK3y3U(H;5g2y{kQDDDXy(tY&U&ZT~D1iKHcy7zdK&9>-4(*xBiz` zxq=-C00AHX1b_e#00KY&2mk>f00jOu0-v?rPJeyf{XDCbJ}^@=olcGpr_FU~b2J>J z!yp~p+a3muHQ2ZoTvE=uvq=%8!-r{bf3&qT80`n|4)&yI#A$x$ZiX!@@k7esW;9=$U$4 z9nIQVLq4NTt#x;4p&LkZgJ;*&O1-YSe(H3t-*9@r`d{@v>VDsq*T1$+C_n%R00AHX z1b_e#00KY&2)qmgPS!e|@lD1~mRUr#RI)^+h$Z68FceG9P>Fe12oW01WmQ!aQ-N@d zs0c`uaFSqg-V`TplCM(odCW~+!(#PP{m`%IE# zyi_Jsh#MXuuC+pnYO9MEIMNaeg^RQ(ni=MjSY+p)nFvON1ehk6CzM|z35&m!gfpv_ z*{8)PuP%4S{hCsw;ymZtpi+gU)g?j{$v8&F;E0es3R#)&sH)n#AW9}!K4&6iCW^<5 zh%*v)wkRX~3QhtMnNf;}K#ZK+Xm!T7YEh)jBU!KzGvkw(Hp4E<~X@ElMPEY(s|?MOxY=#YJIqsY;R;QXNck)674| z#D!fv7ffM7qB;{WHB~mSpNU=j205D2xS%3ODvUMaB-bc5R3aK{p^2e|B0?YO3qxHT z{r#nO{XX9QXGS9lKd<%PH7mB$d+PWf^q%^!_}u@(|Hl9EW%$s+GXeo100e*l5C8%| z00;m9AOHk_!2gTDhBtN>No93v$3+rM?b>mXw9j(wxJW9dRXZ+{AMt9(MG^r^uX*Fu zMe-Z7{l9x-*(MYq00e*l5C8%|00;m9AOHk_01yBIK;XYcpyS1 to build details) + present_count = len(required_detail_cols & cols) + self.assertGreater( + present_count, + 1, + msg=f"Insufficient detail columns for spot_details (have {present_count}, missing: {missing})", + ) + + def test_parquet_cached(self): + parquet_path = Path(f"/s3-cache/{BUCKET}/{DATASET}/{DATASET}.parquet") + # Trigger merge first to ensure file creation + _ = load_and_merge_spots_from_s3(BUCKET, DATASET, SPOTS_PREFIX, valid_spots_only=True) + self.assertTrue(parquet_path.exists(), "Merged parquet file not cached") + self.assertGreater(parquet_path.stat().st_size, 0, "Parquet file size is zero") + + def test_s3_paths_accessible(self): + # Metadata check for a representative object (manifest or unmixed file) + manifest_key = find_processing_manifest(BUCKET, DATASET) + self.assertIsNotNone(manifest_key, "Manifest key missing for accessibility check") + meta = s3_handler.get_object_metadata(manifest_key, bucket_name=BUCKET) + self.assertIsNotNone(meta, "Unable to retrieve metadata for manifest file") + self.assertIn("ContentLength", meta, "Metadata lacks ContentLength") + + +if __name__ == "__main__": + unittest.main() From 9ebe516972aa48fab6be97dbbcb77bdcdfd7d5a5 Mon Sep 17 00:00:00 2001 From: Matt Davis Date: Thu, 13 Nov 2025 14:33:50 -0800 Subject: [PATCH 12/20] feat: DataTables --- src/see_spot/app.py | 10 +- src/see_spot/logging_config.py | 5 +- src/see_spot/static/js/unmixed_spots.js | 165 +++++++++++++++---- src/see_spot/templates/unmixed_spots.html | 183 ++++++++++++++++++---- 4 files changed, 296 insertions(+), 67 deletions(-) diff --git a/src/see_spot/app.py b/src/see_spot/app.py index 201289c..c4729cc 100644 --- a/src/see_spot/app.py +++ b/src/see_spot/app.py @@ -520,7 +520,8 @@ async def get_real_spots_data( "channel_pairs": channel_pairs, "spots_data": data_for_frontend, "spot_details": spot_details, - "fused_s3_paths": fused_s3_paths + "fused_s3_paths": fused_s3_paths, + "current_dataset": DATA_PREFIX # Include current dataset name } if ratios_json: @@ -681,6 +682,7 @@ async def create_neuroglancer_link(request: Request): }, ) + @app.get("/api/datasets") async def list_datasets(): """List all available datasets in the local cache.""" @@ -716,7 +718,6 @@ async def list_datasets(): return JSONResponse(status_code=500, content={"error": str(e)}) - @app.post("/api/datasets/download") async def download_dataset(request: Request): """Download a dataset from S3 to local cache.""" @@ -817,7 +818,6 @@ async def download_dataset(request: Request): return JSONResponse(status_code=500, content={"error": str(e)}) - @app.post("/api/datasets/set-active") async def set_active_dataset(request: Request): """Set the active dataset for the application.""" @@ -856,11 +856,13 @@ async def set_active_dataset(request: Request): logger.error(f"Error setting active dataset: {e}", exc_info=True) return JSONResponse(status_code=500, content={"error": str(e)}) + @app.get("/") @app.get("/unmixed-spots") async def unmixed_spots_page(request: Request): logger.info("Unmixed spots page accessed") return templates.TemplateResponse("unmixed_spots.html", {"request": request}) + if __name__ == '__main__': - uvicorn.run(app, host="0.0.0.0", port=8000) \ No newline at end of file + uvicorn.run(app, host="0.0.0.0", port=8000) diff --git a/src/see_spot/logging_config.py b/src/see_spot/logging_config.py index 852fc57..46c90d0 100644 --- a/src/see_spot/logging_config.py +++ b/src/see_spot/logging_config.py @@ -20,10 +20,7 @@ def setup_logging(level: str = "DEBUG") -> None: "format": "%(asctime)s | %(levelname)-8s | %(name)s | %(message)s", }, "access": { - "format": ( - "%(asctime)s | %(levelname)-8s | uvicorn.access | " - "%(client_addr)s - %(request_line)s -> %(status_code)s" - ), + "format": "%(asctime)s | %(levelname)-8s | uvicorn.access | %(message)s", }, }, "handlers": { diff --git a/src/see_spot/static/js/unmixed_spots.js b/src/see_spot/static/js/unmixed_spots.js index 71880a0..07576ee 100644 --- a/src/see_spot/static/js/unmixed_spots.js +++ b/src/see_spot/static/js/unmixed_spots.js @@ -21,8 +21,8 @@ document.addEventListener('DOMContentLoaded', function () { const highlightRemovedToggle = document.getElementById('highlight_removed_toggle'); const highlightRemovedStatus = document.getElementById('highlight_removed_status'); const displayChanSelect = document.getElementById('display_chan_select'); - const validSpotToggle = document.getElementById('valid_spot_toggle'); - const validSpotStatus = document.getElementById('valid_spot_status'); + // const validSpotToggle = document.getElementById('valid_spot_toggle'); + // const validSpotStatus = document.getElementById('valid_spot_status'); const xlimMin = document.getElementById('xlim_min'); const xlimMax = document.getElementById('xlim_max'); const ylimMin = document.getElementById('ylim_min'); @@ -34,6 +34,10 @@ document.addEventListener('DOMContentLoaded', function () { const summaryBarChartDom = document.getElementById('summary-bar-chart'); const summaryHeatmapDom = document.getElementById('summary-heatmap'); const futureChartDom = document.getElementById('future-chart'); + const spotsContainerHeader = document.getElementById('spots_container_header'); + const spotsContainerContent = document.getElementById('spots_container_content'); + const spotsContainerToggle = document.getElementById('spots_container_toggle'); + const selectedSpotsCount = document.getElementById('selected_spots_count'); const myChart = echarts.init(chartDom); const summaryBarChart = echarts.init(summaryBarChartDom); @@ -49,7 +53,7 @@ document.addEventListener('DOMContentLoaded', function () { let currentSampleSize = parseInt(sampleSizeInput.value) || 10000; let highlightReassigned = false; let highlightRemoved = false; - let displayChanMode = 'unmixed'; // 'unmixed' or 'mixed' + let displayChanMode = 'mixed'; // 'unmixed' or 'mixed' let isNeuroglancerMode = false; let spotDetails = {}; // Will store the spot details for neuroglancer lookup let fusedS3Paths = {}; // Will store the fused S3 paths from the API @@ -57,6 +61,7 @@ document.addEventListener('DOMContentLoaded', function () { let ratiosMatrix = null; // Will store the ratios matrix from the API let sankeyData = null; // Will store the sankey flow data from the API let selectedSpots = new Set(); + let currentDatasetName = 'Unknown Dataset'; // Track current dataset name // Chart limits variables let chartLimitsMode = 'auto'; // 'auto', 'fixed', 'minmax', 'percentile' @@ -111,7 +116,15 @@ document.addEventListener('DOMContentLoaded', function () { }); } + let dataTable = null; // Store DataTables instance + function updateDatasetTable() { + // Destroy existing DataTable if it exists + if (dataTable) { + dataTable.destroy(); + } + + // Clear table body datasetTableBody.innerHTML = ''; datasetList.forEach(dataset => { @@ -135,19 +148,36 @@ document.addEventListener('DOMContentLoaded', function () { statusText = 'Missing'; } - // Truncate long dataset names for display - const displayName = dataset.name.length > 35 ? - dataset.name.substring(0, 32) + '...' : dataset.name; - + // No truncation - show full dataset name + // Format date to show only date (YYYY-MM-DD) without time + const dateOnly = dataset.creation_date.split(' ')[0]; row.innerHTML = ` - ${displayName} - ${dataset.creation_date} + ${dataset.name} + ${dateOnly} ${statusText} `; row.addEventListener('click', () => selectDataset(dataset.name, row)); datasetTableBody.appendChild(row); }); + + // Initialize DataTables with custom configuration + dataTable = $('#dataset_table').DataTable({ + paging: false, // Disable pagination since we have limited datasets + searching: true, // Enable search box + ordering: true, // Enable column sorting + info: false, // Hide "Showing X to Y of Z entries" text + columnDefs: [ + { width: "45%", targets: 0 }, // Dataset Name column + { width: "35%", targets: 1 }, // Date Added column + { width: "20%", targets: 2, orderable: false } // Status column (no sorting) + ], + language: { + search: "Filter datasets:", + searchPlaceholder: "e.g., HCR_76710" + }, + order: [[1, 'desc']] // Sort by Date Added (newest first) by default + }); } function selectDataset(datasetName, rowElement) { @@ -206,6 +236,47 @@ document.addEventListener('DOMContentLoaded', function () { }, 5000); } + // Function to update the dataset title display + function updateDatasetTitle(datasetName) { + console.log('updateDatasetTitle called with:', datasetName); + const titleElement = document.getElementById('dataset-title'); + const nameSpan = titleElement.querySelector('.dataset-name'); + + console.log('titleElement:', titleElement); + console.log('nameSpan:', nameSpan); + + if (!datasetName || datasetName === 'Unknown Dataset') { + console.log('No valid dataset name, showing loading state'); + titleElement.classList.add('loading'); + nameSpan.textContent = 'Loading dataset...'; + return; + } + + // Remove loading state + titleElement.classList.remove('loading'); + + // Format the dataset name for better readability + // Extract key parts: HCR_ID, capture date, processing date + const parts = datasetName.split('_'); + let formattedName = datasetName; + + if (parts.length >= 3 && parts[0] === 'HCR') { + const hcrId = parts[1]; + const captureDate = parts[2]; // YYYY-MM-DD format + formattedName = `HCR ${hcrId} (${captureDate})`; + } + + // Update the display + //nameSpan.textContent = formattedName; + nameSpan.textContent = datasetName; // Show full name, not formatted MJD + nameSpan.title = datasetName; // Full name in tooltip + + // Store current dataset name + currentDatasetName = datasetName; + + console.log(`Dataset title updated: ${formattedName}`); + } + // Dataset management event listeners downloadDatasetBtn.addEventListener('click', function() { const datasetName = datasetNameInput.value.trim(); @@ -329,6 +400,25 @@ document.addEventListener('DOMContentLoaded', function () { // Initialize dataset management loadDatasetList(); + // Toggle collapsible Selected Spots section + spotsContainerHeader.addEventListener('click', function() { + const isCollapsed = spotsContainerContent.classList.contains('collapsed'); + + if (isCollapsed) { + spotsContainerContent.classList.remove('collapsed'); + spotsContainerToggle.classList.remove('collapsed'); + } else { + spotsContainerContent.classList.add('collapsed'); + spotsContainerToggle.classList.add('collapsed'); + } + }); + + // Function to update the selected spots count + function updateSelectedSpotsCount() { + const count = spotsTableBody.rows.length; + selectedSpotsCount.textContent = count; + } + // Update current label when input changes labelInput.addEventListener('input', function() { currentLabel = labelInput.value.trim(); @@ -430,7 +520,7 @@ document.addEventListener('DOMContentLoaded', function () { // Fetch data function function fetchData(sampleSize, forceRefresh = false) { - const validSpotsOnly = validSpotToggle.checked; + const validSpotsOnly = false; // validSpotToggle.checked; // Toggle disabled const url = `/api/real_spots_data?sample_size=${sampleSize}${forceRefresh ? '&force_refresh=true' : ''}${validSpotsOnly ? '&valid_spots_only=true' : '&valid_spots_only=false'}`; console.log(`Fetching data with URL: ${url}`); @@ -442,11 +532,20 @@ document.addEventListener('DOMContentLoaded', function () { return response.json(); }) .then(data => { - console.log(`Fetched unmixed spots data with sample size ${sampleSize}:`, data); + console.log(`Fetched spots data with sample size ${sampleSize}:`, data); + console.log('Current dataset from API:', data.current_dataset); if (!data.spots_data || !data.channel_pairs || data.spots_data.length === 0) { throw new Error("Invalid or empty data received from API"); } + + // Update dataset title if available + if (data.current_dataset) { + console.log('Calling updateDatasetTitle with:', data.current_dataset); + updateDatasetTitle(data.current_dataset); + } else { + console.warn('No current_dataset field in API response'); + } channelPairs = data.channel_pairs; const spotsData = data.spots_data; @@ -1102,6 +1201,9 @@ document.addEventListener('DOMContentLoaded', function () { const labelCell = newRow.insertCell(); labelCell.textContent = label || ''; // Label + + // Update the count + updateSelectedSpotsCount(); } // Event listener for the clear button @@ -1111,6 +1213,9 @@ document.addEventListener('DOMContentLoaded', function () { spotsTableBody.removeChild(spotsTableBody.firstChild); } console.log("Cleared selected spots table."); + + // Update the count + updateSelectedSpotsCount(); }); // Event listener for adding lasso selection to table @@ -1390,25 +1495,25 @@ document.addEventListener('DOMContentLoaded', function () { updateChart(); }); - // Event listener for valid spot toggle - validSpotToggle.addEventListener('change', function() { - validSpotStatus.textContent = this.checked ? 'On' : 'Off'; - - // Update toggle style - const toggleLabel = this.nextElementSibling; - const toggleSpan = toggleLabel.querySelector('span'); - - if (this.checked) { - toggleLabel.style.backgroundColor = '#4CAF50'; // Green when active - toggleSpan.style.left = '22px'; - } else { - toggleLabel.style.backgroundColor = '#ccc'; // Gray when inactive - toggleSpan.style.left = '2px'; - } - - // Reload data with new filter setting - fetchData(currentSampleSize, false); - }); + // Event listener for valid spot toggle - COMMENTED OUT + // validSpotToggle.addEventListener('change', function() { + // validSpotStatus.textContent = this.checked ? 'On' : 'Off'; + // + // // Update toggle style + // const toggleLabel = this.nextElementSibling; + // const toggleSpan = toggleLabel.querySelector('span'); + // + // if (this.checked) { + // toggleLabel.style.backgroundColor = '#4CAF50'; // Green when active + // toggleSpan.style.left = '22px'; + // } else { + // toggleLabel.style.backgroundColor = '#ccc'; // Gray when inactive + // toggleSpan.style.left = '2px'; + // } + // + // // Reload data with new filter setting + // fetchData(currentSampleSize, false); + // }); // Chart limits event listeners function updateButtonStates() { diff --git a/src/see_spot/templates/unmixed_spots.html b/src/see_spot/templates/unmixed_spots.html index 83d3cad..1aff735 100644 --- a/src/see_spot/templates/unmixed_spots.html +++ b/src/see_spot/templates/unmixed_spots.html @@ -6,6 +6,10 @@ Unmixed Spots Visualization + + + + @@ -368,6 +480,14 @@

Chart Limits

+ +
+

+ 📊 + Loading dataset... +

+
+
@@ -397,7 +517,7 @@

📊 Dataset Management

Dataset Name - Date + Added Status @@ -421,7 +541,7 @@

📊 Dataset Management

Refresh Cache
-
+
Small sample: full feature set @@ -441,8 +561,8 @@

📊 Dataset Management

@@ -484,25 +604,30 @@

📊 Dataset Management

-

Selected Spots

- - - - - - - - - - - - - - - - - -
Spot IDChannelX-ChannelY-ChannelRDistUnmixedReassignedLabel
+
+ Selected Spots (0) + +
+
From e94a3bac803a143fd8b772677b786cdf34844b1f Mon Sep 17 00:00:00 2001 From: Matt Davis Date: Thu, 13 Nov 2025 14:51:54 -0800 Subject: [PATCH 13/20] feat: range sliders for metrics --- src/see_spot/static/js/unmixed_spots.js | 221 ++++++++++++++++------ src/see_spot/templates/unmixed_spots.html | 59 +++++- 2 files changed, 224 insertions(+), 56 deletions(-) diff --git a/src/see_spot/static/js/unmixed_spots.js b/src/see_spot/static/js/unmixed_spots.js index 07576ee..b40cbe2 100644 --- a/src/see_spot/static/js/unmixed_spots.js +++ b/src/see_spot/static/js/unmixed_spots.js @@ -68,6 +68,12 @@ document.addEventListener('DOMContentLoaded', function () { let currentXLimits = [0, 2000]; let currentYLimits = [0, 2000]; + // Filter ranges for R Value and Distance + let rValueRange = [0, 100]; + let distanceRange = [0, 100]; + let rValueFilter = [0, 100]; + let distanceFilter = [0, 100]; + // Neuroglancer click debounce variables let lastNeuroglancerClickTime = 0; let lastNeuroglancerSpotId = null; @@ -399,6 +405,129 @@ document.addEventListener('DOMContentLoaded', function () { // Initialize dataset management loadDatasetList(); + + // Setup noUiSlider elements + const rValueSliderEl = document.getElementById('r_value_slider'); + const distanceSliderEl = document.getElementById('distance_slider'); + const rValueMinLabel = document.getElementById('r_value_min_label'); + const rValueMaxLabel = document.getElementById('r_value_max_label'); + const distanceMinLabel = document.getElementById('distance_min_label'); + const distanceMaxLabel = document.getElementById('distance_max_label'); + const resetFiltersBtn = document.getElementById('reset_filters_btn'); + + let rValueSlider = null; + let distanceSlider = null; + + // Function to update filter slider ranges based on data + function updateFilterSliderRanges() { + if (!allChartData || allChartData.length === 0) return; + if (typeof noUiSlider === 'undefined') { + console.error('noUiSlider library not loaded'); + return; + } + + const rValues = allChartData.typedArrays.r; + const distValues = allChartData.typedArrays.dist; + + // Calculate 99th percentile for better range + const rSorted = new Float32Array(rValues).sort(); + const distSorted = new Float32Array(distValues).sort(); + + const r99 = rSorted[Math.floor(rSorted.length * 0.99)] || 1.0; + const dist99 = distSorted[Math.floor(distSorted.length * 0.99)] || 5.0; + + // Cap at reasonable maximums: R at 1.0, Distance at 5.0 + const rMax = Math.min(r99, 1.0); + const distMax = Math.min(dist99, 5.0); + + rValueRange = [0, rMax]; + distanceRange = [0, distMax]; + rValueFilter = [0, rMax]; + distanceFilter = [0, distMax]; + + // Destroy existing sliders if they exist + if (rValueSlider) { + rValueSlider.destroy(); + } + if (distanceSlider) { + distanceSlider.destroy(); + } + + // Create R Value slider + rValueSlider = noUiSlider.create(rValueSliderEl, { + start: [0, rMax], + connect: true, + range: { + 'min': 0, + 'max': rMax + }, + step: 0.01, + tooltips: [true, true], + format: { + to: function(value) { + return value.toFixed(2); + }, + from: function(value) { + return Number(value); + } + } + }); + + // Create Distance slider + distanceSlider = noUiSlider.create(distanceSliderEl, { + start: [0, distMax], + connect: true, + range: { + 'min': 0, + 'max': distMax + }, + step: 0.05, + tooltips: [true, true], + format: { + to: function(value) { + return value.toFixed(2); + }, + from: function(value) { + return Number(value); + } + } + }); + + // Update labels + rValueMinLabel.textContent = '0.00'; + rValueMaxLabel.textContent = rMax.toFixed(2); + distanceMinLabel.textContent = '0.00'; + distanceMaxLabel.textContent = distMax.toFixed(2); + + // Add event listeners for R Value slider - use 'set' event to avoid excessive updates + rValueSlider.on('set', function(values, handle) { + rValueFilter[0] = parseFloat(values[0]); + rValueFilter[1] = parseFloat(values[1]); + updateChart(); + }); + + // Add event listeners for Distance slider - use 'set' event to avoid excessive updates + distanceSlider.on('set', function(values, handle) { + distanceFilter[0] = parseFloat(values[0]); + distanceFilter[1] = parseFloat(values[1]); + updateChart(); + }); + + console.log(`Filter ranges - R: [0, ${rMax.toFixed(2)}], Distance: [0, ${distMax.toFixed(2)}]`); + } + + // Reset filters button + resetFiltersBtn.addEventListener('click', function() { + if (rValueSlider && distanceSlider) { + rValueSlider.set([0, rValueRange[1]]); + distanceSlider.set([0, distanceRange[1]]); + + rValueFilter = [0, rValueRange[1]]; + distanceFilter = [0, distanceRange[1]]; + + updateChart(); + } + }); // Toggle collapsible Selected Spots section spotsContainerHeader.addEventListener('click', function() { @@ -609,6 +738,9 @@ document.addEventListener('DOMContentLoaded', function () { // Create channel selector buttons createChannelSelector(); + + // Update filter slider ranges based on data + updateFilterSliderRanges(); // Set initial channel pair currentPairIndex = 0; @@ -720,6 +852,26 @@ document.addEventListener('DOMContentLoaded', function () { spotsData.removed = removed; } + // Function to apply R Value and Distance filters + function applyFilters(data) { + const filtered = []; + const rValues = data.typedArrays.r; + const distValues = data.typedArrays.dist; + + for (let i = 0; i < data.length; i++) { + const r = rValues[i]; + const dist = distValues[i]; + + // Check if point passes both filters + if (r >= rValueFilter[0] && r <= rValueFilter[1] && + dist >= distanceFilter[0] && dist <= distanceFilter[1]) { + filtered.push(i); + } + } + + return filtered; + } + function updateChart(newData = null) { if (newData) { allChartData = newData; @@ -753,6 +905,15 @@ document.addEventListener('DOMContentLoaded', function () { const xField = `chan_${xChan}_intensity`; const yField = `chan_${yChan}_intensity`; + // Apply filters to get indices of points that pass + const filteredIndices = applyFilters(allChartData); + + // Update filter count display + const filterCountEl = document.getElementById('filter_count'); + if (filterCountEl) { + filterCountEl.textContent = `Showing ${filteredIndices.length.toLocaleString()} of ${allChartData.length.toLocaleString()} points`; + } + // Create series data grouped by display channel (mixed or unmixed) const seriesData = {}; const uniqueChannels = []; @@ -768,7 +929,9 @@ document.addEventListener('DOMContentLoaded', function () { const reassigned = allChartData.reassigned; const removed = allChartData.removed; - for (let i = 0; i < allChartData.length; i++) { + // Only process filtered indices + for (let idx = 0; idx < filteredIndices.length; idx++) { + const i = filteredIndices[idx]; // Use either unmixed channel or original channel based on display mode let displayChan = displayChanMode === 'mixed' ? channels[i] : unmixedChans[i]; @@ -954,8 +1117,8 @@ document.addEventListener('DOMContentLoaded', function () { }, {}) }, grid: { - right: totalSliderWidth + sliderConfig.startRight + 40, // Make room for sliders and legend - bottom: 70 // Still need some bottom space for axis labels + right: 120, // Space for legend only + bottom: 70 // Space for axis labels }, tooltip: { trigger: 'item', @@ -1053,58 +1216,6 @@ document.addEventListener('DOMContentLoaded', function () { filterMode: 'empty' } ], - visualMap: [ - { - // R-value filter - right: sliderConfig.startRight, - top: 'center', - dimension: 2, // The 'r' value is at index 2 in each data point array - min: 0, - max: r99Percentile, - precision: 2, - text: ['R Value'], - textStyle: { - fontSize: 12 - }, - ...sliderConfig, - handleStyle: { - color: '#4285f4' - }, - inRange: { - opacity: 1 - }, - outOfRange: { - opacity: 0.01 - }, - seriesIndex: seriesIndices, // Explicitly set which series this visualMap controls - hoverLink: false // Disable hover highlight when using the slider - }, - { - // Distance filter - right: sliderConfig.startRight + sliderConfig.width + sliderConfig.gap, - top: 'center', - dimension: 6, // The 'dist' value is at index 6 in each data point array - min: 0, - max: dist99Percentile, - precision: 2, - text: ['Distance'], - textStyle: { - fontSize: 12 - }, - ...sliderConfig, - handleStyle: { - color: '#f83628ff' - }, - inRange: { - opacity: 1 - }, - outOfRange: { - opacity: 0.01 - }, - seriesIndex: seriesIndices, // Explicitly set which series this visualMap controls - hoverLink: false // Disable hover highlight when using the slider - } - ], series: series }; diff --git a/src/see_spot/templates/unmixed_spots.html b/src/see_spot/templates/unmixed_spots.html index 1aff735..e4457a9 100644 --- a/src/see_spot/templates/unmixed_spots.html +++ b/src/see_spot/templates/unmixed_spots.html @@ -10,9 +10,12 @@ + + +