Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion encoding/assembly/assemblies.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
class SimpleNeuroidAssembly:
"""Simple alternative to NeuroidAssembly that doesn't require brainio and Xarray."""

def __init__(self, story_data_list: List["StoryData"], validation_method: str):
def __init__(self, story_data_list: List["StoryData"], validation_method: str,is_volume:bool):
"""Initialize assembly with story-level separation.

Args:
Expand All @@ -19,6 +19,7 @@ def __init__(self, story_data_list: List["StoryData"], validation_method: str):
self.stories = [story.name for story in story_data_list]
self.story_data = {story.name: story for story in story_data_list}
self.validation_method = validation_method
self.is_volume = is_volume
# Store combined data for backward compatibility
self.data = np.vstack([story.brain_data for story in story_data_list])

Expand Down
5 changes: 5 additions & 0 deletions encoding/assembly/assembly_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ def create(
mask_path: Optional[str] = None,
analysis_mask_path: Optional[str] = None,
tokenizer: Optional[GPT2Tokenizer] = None,
**kwargs,
) -> BaseAssemblyGenerator:
"""Create a dataset-specific assembly generator.

Expand Down Expand Up @@ -56,6 +57,7 @@ def create(
mask_path,
analysis_mask_path,
tokenizer,
**kwargs
)

@staticmethod
Expand All @@ -72,6 +74,7 @@ def generate_assembly(
generate_temporal_baseline: bool = False,
analysis_mask_path: Optional[str] = None,
tokenizer: Optional[GPT2Tokenizer] = None,
**kwargs,
) -> SimpleNeuroidAssembly:
"""Generate assembly for a subject using the appropriate dataset processor.

Expand All @@ -98,11 +101,13 @@ def generate_assembly(
mask_path,
analysis_mask_path,
tokenizer,
**kwargs
)
return generator.generate_assembly(
subject,
lookback,
context_type,
correlation_length,
generate_temporal_baseline,
**kwargs
)
17 changes: 12 additions & 5 deletions encoding/assembly/base_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,8 +131,8 @@ def _process_fullcontext(
total_len = len(transcript["word_orig"])
ds_data = transcript["word_orig"].astype(str)
stimuli = []
print(f"this is the lookback: {lookback}")
print(f"heloo")
#print(f"this is the lookback: {lookback}")
#print(f"heloo")

for i, w in enumerate(ds_data):
if w != "":
Expand Down Expand Up @@ -336,15 +336,22 @@ def compute_word_rate_features(
return np.array(word_rates) # Shape: (n_trs, 1)

def process_transcript(
self, data_dir: str, story_name: str
self,
data_dir: str,
transcript_file: str,
story_name: str,
) -> Tuple[pd.DataFrame, List[int], np.ndarray, np.ndarray]:
"""Process transcript data and generate split indices and timing information."""
#TODO: Unify file structure for LITcoder
#TODO: make it so stimulis data is in a dictionary instead of a list, or make file store modular
# read pickle file
with open(os.path.join(data_dir, f"{self.dataset_type}_data.pkl"), "rb") as f:
with open(os.path.join(data_dir, f"transcripts/{transcript_file}"), "rb") as f:
data = pickle.load(f)

# this is a list, iterate over it and find the story_name
story = next((s for s in data if s.get("story_name") == story_name), None)
#story = next((s for s in data if s.get("story_name") == story_name), None)
story = data[story_name]

if story is None:
available = [s.get("story_name") for s in data]
raise ValueError(
Expand Down
43 changes: 25 additions & 18 deletions encoding/assembly/lebel_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,10 +27,12 @@ def __init__(
mask_path: Optional[str] = None,
analysis_mask_path: Optional[str] = None,
tokenizer: Optional[GPT2Tokenizer] = None,
stories: Optional[List[str]] = None,
**kwargs,
):
super().__init__(data_dir, dataset_type, tr, use_volume, mask_path, tokenizer)
self.analysis_mask = analysis_mask_path
self.stories = [
self.stories = stories if stories is not None else [
"adollshouse",
"adventuresinsayingyes",
"alternateithicatom",
Expand Down Expand Up @@ -65,6 +67,8 @@ def generate_assembly(
context_type: str = "fullcontext",
correlation_length: int = 100,
generate_temporal_baseline: bool = False,
audio_path: Optional[str] = None,
**kwargs,
) -> SimpleNeuroidAssembly:
"""Generate assembly for a subject by processing all stories.

Expand All @@ -81,20 +85,21 @@ def generate_assembly(
self.generate_temporal_baseline = generate_temporal_baseline

# Process each story
#TODO: fix this to load big files once outside loop
for story in self.stories:
audio_path = f"{self.data_dir}/audio_files/{story}.wav"
story_data = self._process_single_story(
subject,
story,
None,
correlation_length,
generate_temporal_baseline,
subject=subject,
story_name=story,
correlation_length=correlation_length,
generate_temporal_baseline=generate_temporal_baseline,
audio_path=audio_path,
brain_resp_file= kwargs.get('brain_resp_file', 'brain_resp_huge.pkl'),
transcript_file= kwargs.get('transcript_file', 'lebel_transcripts.pkl' )
)
story_data_list.append(story_data)

# Create assembly with story-level separation
return SimpleNeuroidAssembly(story_data_list, validation_method="outer")
return SimpleNeuroidAssembly(story_data_list, validation_method="outer",is_volume=self.use_volume)

def _discover_stories(self, subject_dir: Path) -> List[Dict[str, str]]:
"""Discover all stories for a subject from the directory structure.
Expand All @@ -107,14 +112,16 @@ def _process_single_story(
self,
subject: str,
story_name: str,
volume_path: str,
correlation_length: int = 100,
generate_temporal_baseline: bool = False,
audio_path: Optional[str] = None,
brain_resp_file: Optional[str] = None,
transcript_file: Optional[str] = None
) -> StoryData:
"""Process a single story and return its data using a specified context type.

Args:
subject: Subject identifier
story_name: Name of the story being processed
wordseq: Word sequence data for the story
brain_data: Neural activity data for the story
Expand All @@ -125,18 +132,18 @@ def _process_single_story(
Returns:
StoryData object containing processed story information
"""
if self.use_volume:
with open(f"{self.data_dir}/noslice_sub-{subject}_story_data.pkl", "rb") as f:
resp_dict = pickle.load(f)
else:
with open(f"{self.data_dir}/noslice_sub-{subject}_story_data_surface.pkl", "rb") as f:
resp_dict = pickle.load(f)
#TODO: Unify file structure for LITcoder
with open(f"{self.data_dir}/{subject}/{brain_resp_file}", "rb") as f:
resp_dict = pickle.load(f)

brain_data = resp_dict.get(story_name)


transcript, split_indices, tr_times, data_times, _ = self.process_transcript(
self.data_dir,
story_name
)
self.data_dir,
transcript_file,
story_name)

stimuli = self.generate_stimuli_with_context(transcript, self.lookback)

if self.analysis_mask is not None:
Expand Down
3 changes: 2 additions & 1 deletion encoding/assembly/lpp_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ def __init__(
mask_path: Optional[str] = None,
analysis_mask_path: Optional[str] = None,
tokenizer: Optional[GPT2Tokenizer] = None,
**kwargs,
):
super().__init__(data_dir, dataset_type, tr, use_volume, mask_path, tokenizer)
self.analysis_mask = analysis_mask_path
Expand Down Expand Up @@ -69,7 +70,7 @@ def generate_assembly(
story_data_list.append(story_data)

# Create assembly with run-level separation
return SimpleNeuroidAssembly(story_data_list, validation_method="inner")
return SimpleNeuroidAssembly(story_data_list, validation_method="inner",is_volume=self.use_volume)

def _discover_stories(
self, subject_dir: Path, subject: str
Expand Down
3 changes: 2 additions & 1 deletion encoding/assembly/narratives_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ def __init__(
mask_path: Optional[str] = None,
analysis_mask_path: Optional[str] = None,
tokenizer: Optional[GPT2Tokenizer] = None,
**kwargs,
):
super().__init__(data_dir, dataset_type, tr, use_volume, mask_path, tokenizer)
self.analysis_mask = analysis_mask_path
Expand Down Expand Up @@ -71,7 +72,7 @@ def generate_assembly(
story_data_list.append(story_data)

# Create assembly with story-level separation
return SimpleNeuroidAssembly(story_data_list, validation_method="inner")
return SimpleNeuroidAssembly(story_data_list, validation_method="inner",is_volume=self.use_volume)

def _discover_stories(self, subject_dir: Path) -> List[Dict[str, str]]:
"""Discover all stories for a subject from the directory structure."""
Expand Down
1 change: 1 addition & 0 deletions encoding/assembly/story_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ class StoryData:
Attributes:
name (str): Name identifier for the story/run
brain_data (np.ndarray): Brain activation data, shape (n_timepoints, n_voxels/vertices)
is_volume (bool): true if brain data is volume data, false if surface
stimuli (List[str]): List of text stimuli corresponding to each timepoint
split_indices (List[int]): Indices marking TR boundaries in the data
tr_times (np.ndarray): Array of TR timestamps
Expand Down
Empty file added encoding/data_prep/__init__.py
Empty file.
153 changes: 153 additions & 0 deletions encoding/data_prep/data_prep_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,153 @@
"""Utils for preparing datasets before using litcode.
Adapted from Huth ridge utils.
TODO: Add proper citation"""
from typing import List, Tuple, Union
from pathlib import Path
import json
import numpy as np
from .textgrid import TextGrid
import pickle
import h5py

DEFAULT_BAD_WORDS = frozenset(["sentence_start", "sentence_end", "br", "lg", "ls", "ns", "sp"])




##########Transcript Preprocessing Utils###########

def create_lebel_transcripts(
    story_list: List[str],
    textgrids_dir: Union[str, Path],
    respdict_path: Union[str, Path],
    output_dir: Union[str, Path],
    file_name: str = "lebel_transcripts.pkl",
) -> None:
    """Create processed transcripts for the given stories and pickle them.

    Loads one TextGrid per story, simulates TR times from the response-length
    dictionary, filters and structures the word sequences, and writes the
    result as a single pickle mapping story name -> processed transcript dict.

    Args:
        story_list: List of story names to process.
        textgrids_dir: Directory containing the ``<story>.TextGrid`` files.
        respdict_path: Path to the JSON file mapping story names to response lengths.
        output_dir: Directory to save the generated transcript pickle.
        file_name: Name of the output pickle file.
    """
    text_grids = _load_textgrids(story_list, textgrids_dir)
    with open(respdict_path, "r") as f:
        respdict = json.load(f)
    tr_times = _simulate_trtimes(story_list, respdict)
    processed_transcripts = _process_textgrids(text_grids, tr_times)

    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    with open(output_dir / file_name, "wb") as f:
        pickle.dump(processed_transcripts, f)


def _load_textgrids(stories: List[str], textgrids_dir: Union[str, Path]) -> dict:
    """Load TextGrid files for the given stories.

    Args:
        stories: List of story names.
        textgrids_dir: Directory containing the ``<story>.TextGrid`` files.

    Returns:
        Dictionary mapping story names to their corresponding TextGrid objects.
    """
    textgrids_dir = Path(textgrids_dir)
    return {
        story: TextGrid.load(textgrids_dir / f"{story}.TextGrid")
        for story in stories
    }


def _simulate_trtimes(stories: List[str], respdict: dict, tr: float = 2.0, start_time: float = 10.0, pad: int = 10) -> dict:
"""Simulate TR times for the given stories based on the response dictionary.

Args:
stories: List of story names
respdict: Dictionary mapping story names to their response lengths
tr: Expected TR value
start_time: Start time for the simulation
pad: Padding to subtract from the response length

Returns:
Dictionary mapping story names to their simulated TR times
"""
tr_times = {}
for story in stories:
resp_length = respdict.get(story, 0)
tr_times[story] = list(np.arange(-start_time, (resp_length - pad) * tr, tr))
return tr_times

def _process_textgrids(text_grids: dict,
                       tr_times: dict,
                       bad_words: frozenset = DEFAULT_BAD_WORDS
                       ) -> dict:
    """Process loaded TextGrids into structured per-story transcripts.

    Args:
        text_grids: Dictionary mapping story names to TextGrid objects.
        tr_times: Dictionary mapping story names to their simulated TR times.
        bad_words: Set of non-word markers to filter out of the transcripts.

    Returns:
        Dictionary mapping story names to processed transcript dicts as
        produced by ``_process_single_story``.
    """
    processed_transcripts = {}
    for story, grid in text_grids.items():
        # tiers[1] is assumed to be the word tier; entries are
        # (start_time, end_time, word) tuples — TODO confirm for all corpora.
        simple_transcript = grid.tiers[1].make_simple_transcript()
        # Drop markers such as {sp}/{ns} and sentence-boundary tokens.
        filtered_transcript = [
            entry for entry in simple_transcript
            if entry[2].lower().strip("{}").strip() not in bad_words
        ]
        processed_transcripts[story] = _process_single_story(
            filtered_transcript, tr_times[story]
        )
    return processed_transcripts

def _process_single_story(processed_transcript: List[Tuple],
tr_times: List[float]) -> dict:
"""Process a single story's transcript and TR times to create a structured representation.
Args:
proceesed_transcript: List of tuples representing the transcript (start_time, end_time, word)
tr_times: List of TR times for the story
Returns:
Tuple containing processed story information
"""

data_entries = list(zip(*processed_transcript))[2]
if isinstance(data_entries[0], str):
data = list(map(str.lower, list(zip(*processed_transcript))[2]))
else:
data = data_entries
word_starts = np.array(list(map(float, list(zip(*processed_transcript))[0])))
word_ends = np.array(list(map(float, list(zip(*processed_transcript))[1])))
word_avgtimes = (word_starts + word_ends)/2.0

tr = np.mean(np.diff(tr_times))
tr_midpoints = np.array(tr_times) + tr/2.0

split_inds = [(word_starts<(t+tr)).sum() for t in tr_times][:-1]
return {"words": data, "split_indices": split_inds, "data_times":word_avgtimes,"tr_times": tr_midpoints}

def create_brain_response_dict(story_list: List[str],
                               resp_data_dir: Union[str, Path],
                               output_dir: Union[str, Path],
                               file_name: str = "brain_resp_huge.pkl"
                               ) -> None:
    """Create a dictionary of brain responses for the given stories and pickle it.

    Reads one HDF5 file per story (``<story>.hf5``, dataset ``"data"``) and
    stores all responses in a single pickle mapping story name -> array.

    Args:
        story_list: List of story names to process.
        resp_data_dir: Directory containing the per-story HDF5 response files.
        output_dir: Directory to save the generated brain response dictionary.
        file_name: Name of the output pickle file.
    """
    brain_responses = {}
    for story in story_list:
        resp_data_path = Path(resp_data_dir) / f"{story}.hf5"
        with h5py.File(resp_data_path, "r") as f:
            brain_responses[story] = f["data"][:]

    output_dir = Path(output_dir)
    # Bug fix: create output_dir itself (was output_dir.parent), otherwise
    # the open() below fails with FileNotFoundError when output_dir is new.
    # This also matches create_lebel_transcripts' behavior.
    output_dir.mkdir(parents=True, exist_ok=True)
    with open(output_dir / file_name, "wb") as f:
        pickle.dump(brain_responses, f)
Loading