Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion encoding/assembly/assemblies.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
class SimpleNeuroidAssembly:
"""Simple alternative to NeuroidAssembly that doesn't require brainio and Xarray."""

def __init__(self, story_data_list: List["StoryData"], validation_method: str):
def __init__(self, story_data_list: List["StoryData"], validation_method: str,is_volume:bool):
"""Initialize assembly with story-level separation.

Args:
Expand All @@ -19,6 +19,7 @@ def __init__(self, story_data_list: List["StoryData"], validation_method: str):
self.stories = [story.name for story in story_data_list]
self.story_data = {story.name: story for story in story_data_list}
self.validation_method = validation_method
self.is_volume = is_volume
# Store combined data for backward compatibility
self.data = np.vstack([story.brain_data for story in story_data_list])

Expand Down
5 changes: 5 additions & 0 deletions encoding/assembly/assembly_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ def create(
mask_path: Optional[str] = None,
analysis_mask_path: Optional[str] = None,
tokenizer: Optional[GPT2Tokenizer] = None,
**kwargs,
) -> BaseAssemblyGenerator:
"""Create a dataset-specific assembly generator.

Expand Down Expand Up @@ -56,6 +57,7 @@ def create(
mask_path,
analysis_mask_path,
tokenizer,
**kwargs
)

@staticmethod
Expand All @@ -72,6 +74,7 @@ def generate_assembly(
generate_temporal_baseline: bool = False,
analysis_mask_path: Optional[str] = None,
tokenizer: Optional[GPT2Tokenizer] = None,
**kwargs,
) -> SimpleNeuroidAssembly:
"""Generate assembly for a subject using the appropriate dataset processor.

Expand All @@ -98,11 +101,13 @@ def generate_assembly(
mask_path,
analysis_mask_path,
tokenizer,
**kwargs
)
return generator.generate_assembly(
subject,
lookback,
context_type,
correlation_length,
generate_temporal_baseline,
**kwargs
)
17 changes: 12 additions & 5 deletions encoding/assembly/base_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,8 +131,8 @@ def _process_fullcontext(
total_len = len(transcript["word_orig"])
ds_data = transcript["word_orig"].astype(str)
stimuli = []
print(f"this is the lookback: {lookback}")
print(f"heloo")
#print(f"this is the lookback: {lookback}")
#print(f"heloo")

for i, w in enumerate(ds_data):
if w != "":
Expand Down Expand Up @@ -336,15 +336,22 @@ def compute_word_rate_features(
return np.array(word_rates) # Shape: (n_trs, 1)

def process_transcript(
self, data_dir: str, story_name: str
self,
data_dir: str,
transcript_file: str,
story_name: str,
) -> Tuple[pd.DataFrame, List[int], np.ndarray, np.ndarray]:
"""Process transcript data and generate split indices and timing information."""
#TODO: Unify file structure for LITcoder
#TODO: make it so stimulis data is in a dictionary instead of a list, or make file store modular
# read pickle file
with open(os.path.join(data_dir, f"{self.dataset_type}_data.pkl"), "rb") as f:
with open(os.path.join(data_dir, f"transcripts/{transcript_file}"), "rb") as f:
data = pickle.load(f)

# this is a list, iterate over it and find the story_name
story = next((s for s in data if s.get("story_name") == story_name), None)
#story = next((s for s in data if s.get("story_name") == story_name), None)
story = data[story_name]

if story is None:
available = [s.get("story_name") for s in data]
raise ValueError(
Expand Down
43 changes: 25 additions & 18 deletions encoding/assembly/lebel_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,10 +27,12 @@ def __init__(
mask_path: Optional[str] = None,
analysis_mask_path: Optional[str] = None,
tokenizer: Optional[GPT2Tokenizer] = None,
stories: Optional[List[str]] = None,
**kwargs,
):
super().__init__(data_dir, dataset_type, tr, use_volume, mask_path, tokenizer)
self.analysis_mask = analysis_mask_path
self.stories = [
self.stories = stories if stories is not None else [
"adollshouse",
"adventuresinsayingyes",
"alternateithicatom",
Expand Down Expand Up @@ -65,6 +67,8 @@ def generate_assembly(
context_type: str = "fullcontext",
correlation_length: int = 100,
generate_temporal_baseline: bool = False,
audio_path: Optional[str] = None,
**kwargs,
) -> SimpleNeuroidAssembly:
"""Generate assembly for a subject by processing all stories.

Expand All @@ -81,20 +85,21 @@ def generate_assembly(
self.generate_temporal_baseline = generate_temporal_baseline

# Process each story
#TODO: fix this to load big files once outside loop
for story in self.stories:
audio_path = f"{self.data_dir}/audio_files/{story}.wav"
story_data = self._process_single_story(
subject,
story,
None,
correlation_length,
generate_temporal_baseline,
subject=subject,
story_name=story,
correlation_length=correlation_length,
generate_temporal_baseline=generate_temporal_baseline,
audio_path=audio_path,
brain_resp_file= kwargs.get('brain_resp_file', 'brain_resp_huge.pkl'),
transcript_file= kwargs.get('transcript_file', 'lebel_transcripts.pkl' )
)
story_data_list.append(story_data)

# Create assembly with story-level separation
return SimpleNeuroidAssembly(story_data_list, validation_method="outer")
return SimpleNeuroidAssembly(story_data_list, validation_method="outer",is_volume=self.use_volume)

def _discover_stories(self, subject_dir: Path) -> List[Dict[str, str]]:
"""Discover all stories for a subject from the directory structure.
Expand All @@ -107,14 +112,16 @@ def _process_single_story(
self,
subject: str,
story_name: str,
volume_path: str,
correlation_length: int = 100,
generate_temporal_baseline: bool = False,
audio_path: Optional[str] = None,
brain_resp_file: Optional[str] = None,
transcript_file: Optional[str] = None
) -> StoryData:
"""Process a single story and return its data using a specified context type.

Args:
subject: Subject identifier
story_name: Name of the story being processed
wordseq: Word sequence data for the story
brain_data: Neural activity data for the story
Expand All @@ -125,18 +132,18 @@ def _process_single_story(
Returns:
StoryData object containing processed story information
"""
if self.use_volume:
with open(f"{self.data_dir}/noslice_sub-{subject}_story_data.pkl", "rb") as f:
resp_dict = pickle.load(f)
else:
with open(f"{self.data_dir}/noslice_sub-{subject}_story_data_surface.pkl", "rb") as f:
resp_dict = pickle.load(f)
#TODO: Unify file structure for LITcoder
with open(f"{self.data_dir}/{subject}/{brain_resp_file}", "rb") as f:
resp_dict = pickle.load(f)

brain_data = resp_dict.get(story_name)


transcript, split_indices, tr_times, data_times, _ = self.process_transcript(
self.data_dir,
story_name
)
self.data_dir,
transcript_file,
story_name)

stimuli = self.generate_stimuli_with_context(transcript, self.lookback)

if self.analysis_mask is not None:
Expand Down
3 changes: 2 additions & 1 deletion encoding/assembly/lpp_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ def __init__(
mask_path: Optional[str] = None,
analysis_mask_path: Optional[str] = None,
tokenizer: Optional[GPT2Tokenizer] = None,
**kwargs,
):
super().__init__(data_dir, dataset_type, tr, use_volume, mask_path, tokenizer)
self.analysis_mask = analysis_mask_path
Expand Down Expand Up @@ -69,7 +70,7 @@ def generate_assembly(
story_data_list.append(story_data)

# Create assembly with run-level separation
return SimpleNeuroidAssembly(story_data_list, validation_method="inner")
return SimpleNeuroidAssembly(story_data_list, validation_method="inner",is_volume=self.use_volume)

def _discover_stories(
self, subject_dir: Path, subject: str
Expand Down
3 changes: 2 additions & 1 deletion encoding/assembly/narratives_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ def __init__(
mask_path: Optional[str] = None,
analysis_mask_path: Optional[str] = None,
tokenizer: Optional[GPT2Tokenizer] = None,
**kwargs,
):
super().__init__(data_dir, dataset_type, tr, use_volume, mask_path, tokenizer)
self.analysis_mask = analysis_mask_path
Expand Down Expand Up @@ -71,7 +72,7 @@ def generate_assembly(
story_data_list.append(story_data)

# Create assembly with story-level separation
return SimpleNeuroidAssembly(story_data_list, validation_method="inner")
return SimpleNeuroidAssembly(story_data_list, validation_method="inner",is_volume=self.use_volume)

def _discover_stories(self, subject_dir: Path) -> List[Dict[str, str]]:
"""Discover all stories for a subject from the directory structure."""
Expand Down
1 change: 1 addition & 0 deletions encoding/assembly/story_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ class StoryData:
Attributes:
name (str): Name identifier for the story/run
brain_data (np.ndarray): Brain activation data, shape (n_timepoints, n_voxels/vertices)
is_volume (bool): true if brain data is volume data, false if surface
stimuli (List[str]): List of text stimuli corresponding to each timepoint
split_indices (List[int]): Indices marking TR boundaries in the data
tr_times (np.ndarray): Array of TR timestamps
Expand Down
Empty file added encoding/data_prep/__init__.py
Empty file.
153 changes: 153 additions & 0 deletions encoding/data_prep/data_prep_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,153 @@
"""Utils for preparing datasets before using litcode.
Adapted from Huth ridge utils.
TODO: Add proper citation"""
from typing import List, Tuple, Union
from pathlib import Path
import json
import numpy as np
from .textgrid import TextGrid
import pickle
import h5py

DEFAULT_BAD_WORDS = frozenset(["sentence_start", "sentence_end", "br", "lg", "ls", "ns", "sp"])




##########Transcript Preprocessing Utils###########

def create_lebel_transcripts(
    story_list: List[str],
    textgrids_dir: Union[str, Path],
    respdict_path: Union[str, Path],
    output_dir: Union[str, Path],
    file_name: str = "lebel_transcripts.pkl",
) -> None:
    """Create processed transcripts for the given stories and pickle them.

    Loads one TextGrid per story, simulates TR times from the response-length
    dictionary, filters and structures the word sequences, and writes the
    result as a single pickle mapping story name -> processed transcript dict.

    Args:
        story_list: List of story names to process.
        textgrids_dir: Directory containing the ``<story>.TextGrid`` files.
        respdict_path: Path to the JSON file mapping story names to response lengths.
        output_dir: Directory to save the generated transcript pickle.
        file_name: Name of the output pickle file.
    """
    text_grids = _load_textgrids(story_list, textgrids_dir)
    with open(respdict_path, "r") as f:
        respdict = json.load(f)
    tr_times = _simulate_trtimes(story_list, respdict)
    processed_transcripts = _process_textgrids(text_grids, tr_times)

    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    with open(output_dir / file_name, "wb") as f:
        pickle.dump(processed_transcripts, f)


def _load_textgrids(stories: List[str], textgrids_dir: Union[str, Path]) -> dict:
    """Load TextGrid files for the given stories.

    Args:
        stories: List of story names.
        textgrids_dir: Directory containing the ``<story>.TextGrid`` files.

    Returns:
        Dictionary mapping story names to their corresponding TextGrid objects.
    """
    textgrids_dir = Path(textgrids_dir)
    return {
        story: TextGrid.load(textgrids_dir / f"{story}.TextGrid")
        for story in stories
    }


def _simulate_trtimes(stories: List[str], respdict: dict, tr: float = 2.0, start_time: float = 10.0, pad: int = 10) -> dict:
"""Simulate TR times for the given stories based on the response dictionary.

Args:
stories: List of story names
respdict: Dictionary mapping story names to their response lengths
tr: Expected TR value
start_time: Start time for the simulation
pad: Padding to subtract from the response length

Returns:
Dictionary mapping story names to their simulated TR times
"""
tr_times = {}
for story in stories:
resp_length = respdict.get(story, 0)
tr_times[story] = list(np.arange(-start_time, (resp_length - pad) * tr, tr))
return tr_times

def _process_textgrids(text_grids: dict,
                       tr_times: dict,
                       bad_words: frozenset = DEFAULT_BAD_WORDS
                       ) -> dict:
    """Process loaded TextGrids into structured per-story transcripts.

    Args:
        text_grids: Dictionary mapping story names to TextGrid objects.
        tr_times: Dictionary mapping story names to their simulated TR times.
        bad_words: Set of non-word markers to filter out of the transcripts.

    Returns:
        Dictionary mapping story names to processed transcript dicts as
        produced by ``_process_single_story``.
    """
    processed_transcripts = {}
    for story, grid in text_grids.items():
        # tiers[1] is assumed to be the word tier; entries are
        # (start_time, end_time, word) tuples — TODO confirm for all corpora.
        simple_transcript = grid.tiers[1].make_simple_transcript()
        # Drop markers such as {sp}/{ns} and sentence-boundary tokens.
        filtered_transcript = [
            entry for entry in simple_transcript
            if entry[2].lower().strip("{}").strip() not in bad_words
        ]
        processed_transcripts[story] = _process_single_story(
            filtered_transcript, tr_times[story]
        )
    return processed_transcripts

def _process_single_story(processed_transcript: List[Tuple],
tr_times: List[float]) -> dict:
"""Process a single story's transcript and TR times to create a structured representation.
Args:
proceesed_transcript: List of tuples representing the transcript (start_time, end_time, word)
tr_times: List of TR times for the story
Returns:
Tuple containing processed story information
"""

data_entries = list(zip(*processed_transcript))[2]
if isinstance(data_entries[0], str):
data = list(map(str.lower, list(zip(*processed_transcript))[2]))
else:
data = data_entries
word_starts = np.array(list(map(float, list(zip(*processed_transcript))[0])))
word_ends = np.array(list(map(float, list(zip(*processed_transcript))[1])))
word_avgtimes = (word_starts + word_ends)/2.0

tr = np.mean(np.diff(tr_times))
tr_midpoints = np.array(tr_times) + tr/2.0

split_inds = [(word_starts<(t+tr)).sum() for t in tr_times][:-1]
return {"words": data, "split_indices": split_inds, "data_times":word_avgtimes,"tr_times": tr_midpoints}

def create_brain_response_dict(story_list: List[str],
                               resp_data_dir: Union[str, Path],
                               output_dir: Union[str, Path],
                               file_name: str = "brain_resp_huge.pkl"
                               ) -> None:
    """Create a dictionary of brain responses for the given stories and pickle it.

    Reads one HDF5 file per story (``<story>.hf5``, dataset ``"data"``) and
    stores all responses in a single pickle mapping story name -> array.

    Args:
        story_list: List of story names to process.
        resp_data_dir: Directory containing the per-story HDF5 response files.
        output_dir: Directory to save the generated brain response dictionary.
        file_name: Name of the output pickle file.
    """
    brain_responses = {}
    for story in story_list:
        resp_data_path = Path(resp_data_dir) / f"{story}.hf5"
        with h5py.File(resp_data_path, "r") as f:
            brain_responses[story] = f["data"][:]

    output_dir = Path(output_dir)
    # Bug fix: create output_dir itself (was output_dir.parent), otherwise
    # the open() below fails with FileNotFoundError when output_dir is new.
    # This also matches create_lebel_transcripts' behavior.
    output_dir.mkdir(parents=True, exist_ok=True)
    with open(output_dir / file_name, "wb") as f:
        pickle.dump(brain_responses, f)
Loading