-
Notifications
You must be signed in to change notification settings - Fork 33
feat: extract brain area anatomy from NWB location fields #1807
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Open
bendichter
wants to merge
2
commits into
master
Choose a base branch
from
add-brain-area-anatomy
base: master
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
Open
Changes from all commits
Commits
Show all changes
2 commits
Select commit
Hold shift + click to select a range
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Empty file.
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,37 @@ | ||
| #!/usr/bin/env python3 | ||
| """Regenerate allen_ccf_structures.json from Allen Brain Map API. | ||
|
|
||
| Run: python -m dandi.data.generate_allen_structures | ||
| """ | ||
|
|
||
| from __future__ import annotations | ||
|
|
||
| import json | ||
| from pathlib import Path | ||
|
|
||
| import requests | ||
|
|
||
|
|
||
| def _flatten(node: dict, out: list[dict]) -> None: | ||
| out.append({"id": node["id"], "acronym": node["acronym"], "name": node["name"]}) | ||
| for child in node.get("children", []): | ||
| _flatten(child, out) | ||
|
|
||
|
|
||
| def main() -> None: | ||
| url = "http://api.brain-map.org/api/v2/structure_graph_download/1.json" | ||
| resp = requests.get(url, timeout=30) | ||
| resp.raise_for_status() | ||
| data = resp.json() | ||
| structures: list[dict] = [] | ||
| root = data["msg"][0] | ||
| _flatten(root, structures) | ||
| structures.sort(key=lambda s: s["id"]) | ||
| out_path = Path(__file__).with_name("allen_ccf_structures.json") | ||
| with open(out_path, "w") as f: | ||
| json.dump(structures, f, separators=(",", ":")) | ||
| print(f"Wrote {len(structures)} structures to {out_path}") | ||
|
|
||
|
|
||
# Allow running the regeneration script directly (see module docstring).
if __name__ == "__main__":
    main()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,221 @@ | ||
| from __future__ import annotations | ||
|
|
||
| import ast | ||
| from functools import lru_cache | ||
| import json | ||
| from pathlib import Path | ||
| import re | ||
| from typing import Any | ||
|
|
||
| from dandischema import models | ||
|
|
||
| from .. import get_logger | ||
|
|
||
| lgr = get_logger() | ||
|
|
||
# OBO PURL template for Mouse Brain Atlas (MBA) terms; filled with the
# numeric Allen structure id to form an Anatomy identifier.
MBAO_URI_TEMPLATE = "http://purl.obolibrary.org/obo/MBA_{}"

# Values that should be treated as missing / uninformative.
# Callers compare lower-cased, whitespace-stripped values against this set.
_TRIVIAL_VALUES = frozenset(
    {
        "",
        "unknown",
        "none",
        "n/a",
        "na",
        "null",
        "unspecified",
        "not available",
        "not applicable",
        "brain",
    }
)
|
|
||
|
|
||
| @lru_cache(maxsize=1) | ||
| def _load_allen_structures() -> list[dict[str, Any]]: | ||
| """Load the bundled Allen CCF structures JSON.""" | ||
| data_path = ( | ||
| Path(__file__).resolve().parent.parent / "data" / "allen_ccf_structures.json" | ||
| ) | ||
| with open(data_path) as f: | ||
| structures: list[dict[str, Any]] = json.load(f) | ||
| return structures | ||
|
|
||
|
|
||
| @lru_cache(maxsize=1) | ||
| def _build_lookup_dicts() -> ( | ||
| tuple[dict[str, dict], dict[str, dict], dict[str, dict], dict[str, dict]] | ||
| ): | ||
| """Build lookup dictionaries for Allen CCF structures. | ||
|
|
||
| Returns | ||
| ------- | ||
| tuple of 4 dicts | ||
| (acronym_exact, acronym_lower, name_exact, name_lower) | ||
| """ | ||
| structures = _load_allen_structures() | ||
| acronym_exact: dict[str, dict] = {} | ||
| acronym_lower: dict[str, dict] = {} | ||
| name_exact: dict[str, dict] = {} | ||
| name_lower: dict[str, dict] = {} | ||
| for s in structures: | ||
| acr = s["acronym"] | ||
| name = s["name"] | ||
| # First match wins (structures are sorted by id) | ||
| if acr not in acronym_exact: | ||
| acronym_exact[acr] = s | ||
| acr_low = acr.lower() | ||
| if acr_low not in acronym_lower: | ||
| acronym_lower[acr_low] = s | ||
| if name not in name_exact: | ||
| name_exact[name] = s | ||
| name_low = name.lower() | ||
| if name_low not in name_lower: | ||
| name_lower[name_low] = s | ||
| return acronym_exact, acronym_lower, name_exact, name_lower | ||
|
|
||
|
|
||
| def _parse_location_string(location: str) -> list[str]: | ||
| """Parse a raw NWB location string into area tokens. | ||
|
|
||
| Handles: | ||
| - Simple strings: ``"VISp"`` | ||
| - Dict literals: ``"{'area': 'VISp', 'depth': '20'}"`` | ||
| - Key-value pairs: ``"area: VISp, depth: 175"`` | ||
| - Comma-separated lists: ``"VISp,VISrl,VISlm"`` | ||
| """ | ||
| location = location.strip() | ||
| if not location or location.lower() in _TRIVIAL_VALUES: | ||
| return [] | ||
|
|
||
| # Try dict literal (e.g. "{'area': 'VISp', 'depth': 20}") | ||
| if location.startswith("{"): | ||
| try: | ||
| d = ast.literal_eval(location) | ||
| if isinstance(d, dict): | ||
| # Look for known area keys | ||
| for key in ("area", "location", "region", "brain_area", "brain_region"): | ||
| val = d.get(key) | ||
| if val is not None: | ||
| val = str(val).strip() | ||
| if val and val.lower() not in _TRIVIAL_VALUES: | ||
| return [val] | ||
| # If no known key, return all string values that are non-trivial | ||
| tokens = [] | ||
| for val in d.values(): | ||
| val = str(val).strip() | ||
| if val and val.lower() not in _TRIVIAL_VALUES: | ||
| # Skip purely numeric values (e.g. depth) | ||
| try: | ||
| float(val) | ||
| except ValueError: | ||
| tokens.append(val) | ||
| return tokens | ||
| except (ValueError, SyntaxError): | ||
| pass # Not a valid dict literal; fall through to other parsers | ||
|
|
||
| # Try key-value format (e.g. "area: VISp, depth: 175") | ||
| if re.search(r"\w+\s*:", location) and "://" not in location: | ||
| pairs = re.split(r",\s*", location) | ||
| kv: dict[str, str] = {} | ||
| for pair in pairs: | ||
| m = re.match(r"(\w+)\s*:\s*(.+)", pair.strip()) | ||
| if m: | ||
| kv[m.group(1).lower()] = m.group(2).strip() | ||
| if kv: | ||
| for key in ("area", "location", "region", "brain_area", "brain_region"): | ||
| val = kv.get(key) | ||
| if val is not None and val.lower() not in _TRIVIAL_VALUES: | ||
| return [val] | ||
| # Fall through — return non-trivial, non-numeric values | ||
| tokens = [] | ||
| for val in kv.values(): | ||
| if val.lower() not in _TRIVIAL_VALUES: | ||
| try: | ||
| float(val) | ||
| except ValueError: | ||
| tokens.append(val) | ||
| if tokens: | ||
| return tokens | ||
|
|
||
| # Comma-separated list | ||
| if "," in location: | ||
| tokens = [t.strip() for t in location.split(",")] | ||
| return [t for t in tokens if t and t.lower() not in _TRIVIAL_VALUES] | ||
|
|
||
| # Simple string | ||
| return [location] | ||
|
|
||
|
|
||
| def match_location_to_allen(token: str) -> models.Anatomy | None: | ||
| """Match a single location token against Allen CCF structures. | ||
|
|
||
| Tries exact acronym, case-insensitive acronym, exact name, | ||
| case-insensitive name in that order. | ||
|
|
||
| Returns | ||
| ------- | ||
| models.Anatomy or None | ||
| """ | ||
| acronym_exact, acronym_lower, name_exact, name_lower = _build_lookup_dicts() | ||
| token_stripped = token.strip() | ||
| if not token_stripped: | ||
| return None | ||
|
|
||
| # 1. Exact acronym match | ||
| s = acronym_exact.get(token_stripped) | ||
| if s is not None: | ||
| return _structure_to_anatomy(s) | ||
|
|
||
| # 2. Case-insensitive acronym match | ||
| s = acronym_lower.get(token_stripped.lower()) | ||
| if s is not None: | ||
| return _structure_to_anatomy(s) | ||
|
|
||
| # 3. Exact name match | ||
| s = name_exact.get(token_stripped) | ||
| if s is not None: | ||
| return _structure_to_anatomy(s) | ||
|
|
||
| # 4. Case-insensitive name match | ||
| s = name_lower.get(token_stripped.lower()) | ||
| if s is not None: | ||
| return _structure_to_anatomy(s) | ||
|
|
||
| lgr.debug("Could not match brain location %r to Allen CCF", token_stripped) | ||
| return None | ||
|
|
||
|
|
||
| def _structure_to_anatomy(s: dict[str, Any]) -> models.Anatomy: | ||
| return models.Anatomy( | ||
| identifier=MBAO_URI_TEMPLATE.format(s["id"]), | ||
| name=s["name"], | ||
| ) | ||
|
|
||
|
|
||
| def locations_to_anatomy(locations: list[str]) -> list[models.Anatomy]: | ||
| """Convert raw NWB location strings to deduplicated Anatomy list. | ||
|
|
||
| Parameters | ||
| ---------- | ||
| locations : list[str] | ||
| Raw location strings from NWB file. | ||
|
|
||
| Returns | ||
| ------- | ||
| list[models.Anatomy] | ||
| Matched and deduplicated anatomy entries. | ||
| """ | ||
| seen_ids: set[str] = set() | ||
| results: list[models.Anatomy] = [] | ||
| for loc in locations: | ||
| tokens = _parse_location_string(loc) | ||
| for token in tokens: | ||
| anatomy = match_location_to_allen(token) | ||
| if anatomy is not None: | ||
| id_str = str(anatomy.identifier) | ||
| if id_str not in seen_ids: | ||
| seen_ids.add(id_str) | ||
| results.append(anatomy) | ||
| return results | ||
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.