From da7f64a9bef9910a69781eda73ffba7aaa268744 Mon Sep 17 00:00:00 2001 From: Daniel Suveges Date: Mon, 9 Feb 2026 19:21:08 +0000 Subject: [PATCH 1/4] refactor: extract process_gencode_data into testable static methods Break the monolithic process_gencode_data into 6 independently testable static methods (parse_raw_gtf, parse_gtf_annotations, strip_gene_id_version, filter_protein_coding_genes, add_length_column, process_single_gene) and fix the chromosome filter to handle the 'chr' prefix in standard GTF files. Add unit tests for each extracted method plus integration tests using a sample GENCODE GTF (OPRK1 / ENSG00000082556). Move config.json to repo root. Co-Authored-By: Claude Opus 4.6 --- src/genome_plotter/config.json => config.json | 0 .../input_parsers/fetch_gencode.py | 247 +++++---- src/tests/sample_data/gencode_sample.gtf.gz | Bin 0 -> 1458 bytes src/tests/test_fetch_gencode.py | 478 ++++++++++++++++++ 4 files changed, 634 insertions(+), 91 deletions(-) rename src/genome_plotter/config.json => config.json (100%) create mode 100644 src/tests/sample_data/gencode_sample.gtf.gz create mode 100644 src/tests/test_fetch_gencode.py diff --git a/src/genome_plotter/config.json b/config.json similarity index 100% rename from src/genome_plotter/config.json rename to config.json diff --git a/src/genome_plotter/input_parsers/fetch_gencode.py b/src/genome_plotter/input_parsers/fetch_gencode.py index 4a09932..f412fbd 100644 --- a/src/genome_plotter/input_parsers/fetch_gencode.py +++ b/src/genome_plotter/input_parsers/fetch_gencode.py @@ -83,6 +83,26 @@ def _get_gencode_version(file_list: list[str]) -> int: # Get the latest release: return max(releases) + @staticmethod + def parse_raw_gtf(tsv_data: pd.DataFrame) -> pd.DataFrame: + """Select and rename GTF columns from a raw tab-separated DataFrame. + + Args: + tsv_data (pd.DataFrame): Raw DataFrame read from a GTF file (no header, comment lines skipped). 
+ + Returns: + pd.DataFrame: DataFrame with columns: chr, type, start, end, strand, annotation. + """ + columns = { + 0: "chr", + 2: "type", + 3: "start", + 4: "end", + 6: "strand", + 8: "annotation", + } + return tsv_data.rename(columns=columns)[columns.values()] + def retrieve_data(self: FetchGencode) -> None: """Retrieving release data and association table. Then the connection is closed. @@ -108,16 +128,8 @@ def retrieve_data(self: FetchGencode) -> None: header=None, ) - # Parse data: - columns = { - 0: "chr", - 2: "type", - 3: "start", - 4: "end", - 6: "strand", - 8: "annotation", - } - self.gencode_raw = self.tsv_data.rename(columns=columns)[columns.values()] + # Parse raw GTF into structured columns: + self.gencode_raw = self.parse_raw_gtf(self.tsv_data) # Close connection: self.close_connection() @@ -128,55 +140,152 @@ def retrieve_data(self: FetchGencode) -> None: gene_count = len(self.gencode_raw.loc[self.gencode_raw.type == "gene"]) logger.info(f"Number of genes in {self.release} release: {gene_count:,}") - def process_gencode_data(self: FetchGencode) -> None: - """Process the raw GENCODE data into structured gene annotations.""" - # Parsing gtf annotation: - logger.info("Parsing GTF annotation.") - parsed_annotation = self.gencode_raw.annotation.apply( + @staticmethod + def parse_gtf_annotations(gencode_raw: pd.DataFrame) -> pd.DataFrame: + """Parse GTF annotation strings into columns and merge with coordinates. + + Args: + gencode_raw (pd.DataFrame): Raw GENCODE data with an 'annotation' column. + + Returns: + pd.DataFrame: DataFrame with parsed annotation columns, 'annotation' column dropped. 
+ """ + parsed_annotation = gencode_raw.annotation.apply( lambda annotation: { x.strip().split(" ", 1)[0]: x.strip().split(" ", 1)[1].replace('"', "") for x in annotation.split(";") if x != "" } ) - - # Merging annotation with coordinates: - gencode_df_updated = self.gencode_raw.merge( + df = gencode_raw.merge( pd.DataFrame(parsed_annotation.tolist()), left_index=True, right_index=True ) + df.drop(["annotation"], axis=1, inplace=True) + return df - # Drop unparsed annotation column: - gencode_df_updated.drop(["annotation"], axis=1, inplace=True) + @staticmethod + def strip_gene_id_version(df: pd.DataFrame) -> pd.DataFrame: + """Strip `.version` suffix from gene_id column. + + Args: + df (pd.DataFrame): DataFrame with a 'gene_id' column. + + Returns: + pd.DataFrame: DataFrame with version-stripped gene_id. + """ + return df.assign(gene_id=df.gene_id.str.split(".").str[0]) + + @staticmethod + def filter_protein_coding_genes(df: pd.DataFrame) -> pd.DataFrame: + """Filter for protein-coding genes on conventional chromosomes. + + Applies four filters: + - Conventional chromosomes (bare name after stripping 'chr' prefix is <= 2 chars) + - protein_coding gene_type + - gene_name does not start with 'ENSG' + - gene_name does not contain 'orf' + + Args: + df (pd.DataFrame): DataFrame with chr, gene_type, and gene_name columns. - # Removing gene identifier version: - gencode_df_updated = gencode_df_updated.assign( - gene_id=gencode_df_updated.gene_id.str.split(".").str[0] + Returns: + pd.DataFrame: Filtered DataFrame. + """ + return df.loc[ + (df.chr.str.replace(r"^chr", "", regex=True).str.len() <= 2) + & (df.gene_type == "protein_coding") + & (~df.gene_name.str.startswith("ENSG")) + & (~df.gene_name.str.contains("orf")) + ] + + @staticmethod + def add_length_column(df: pd.DataFrame) -> pd.DataFrame: + """Cast start/end to int and add a length column. + + Args: + df (pd.DataFrame): DataFrame with 'start' and 'end' columns. 
+ + Returns: + pd.DataFrame: DataFrame with int-typed start/end and a new 'length' column. + """ + df = df.copy() + df["start"] = df.start.astype(int) + df["end"] = df.end.astype(int) + return df.assign(length=lambda row: row["end"] - row["start"]) + + @staticmethod + def process_single_gene( + gene_id: str, features: pd.DataFrame + ) -> tuple[pd.DataFrame, pd.DataFrame] | None: + """Process a single gene: select canonical transcript, build arrow and exon/intron data. + + Args: + gene_id (str): The gene identifier. + features (pd.DataFrame): All features (transcripts, exons, CDS, UTR) for the gene. + + Returns: + tuple[pd.DataFrame, pd.DataFrame] | None: (gene_df, arrow_df) or None if no + protein-coding transcript is found. + """ + transcripts = features.loc[ + (features.type == "transcript") + & (features.transcript_type == "protein_coding") + ] + + if len(transcripts) == 0: + return None + + cds_length = transcripts.transcript_id.apply( + lambda t_id: features.loc[ + (features.type == "CDS") & (features.transcript_id == t_id) + ].length.sum() + ) + transcripts = transcripts.copy() + transcripts.insert(2, "cds_length", cds_length) + + canonical_transcript_id = FetchGencode.get_canonical_transcript(transcripts) + [start, end] = ( + transcripts.loc[ + transcripts.transcript_id == canonical_transcript_id, + ["start", "end"], + ] + .iloc[0] + .tolist() ) - # Cleaning up genes: - gencode_df_updated = gencode_df_updated.loc[ - # Dropping entries on non-conventional chromosomes: - (gencode_df_updated.chr.str.len <= 2) - # Filtering for protein coding genes: - & (gencode_df_updated.gene_type == "protein_coding") - # Dropping novel genes without proper gene name: - & (~gencode_df_updated.gene_name.str.startswith("ENSG")) - # Dropping novel genes without proper gene name: - & (~gencode_df_updated.gene_name.str.contains("orf")) + arrow_df = features.loc[ + (features.transcript_id == canonical_transcript_id) + & (features.type.isin(["CDS", "UTR"])), + ["chr", "start", "end", 
"strand", "type", "gene_id", "gene_name"], ] - protein_coding_gene_count = len( - gencode_df_updated.loc[gencode_df_updated.type == "gene"] + + gene_df = FetchGencode.generate_exon_intron_structure( + gene_id, + canonical_transcript_id, + start, + end, + features.loc[ + (features.transcript_id == canonical_transcript_id) + & (features.type == "exon") + ], ) - logger.info(f"Number of protein coding genes: {protein_coding_gene_count:,}") - # Updating types: - gencode_df_updated["start"] = gencode_df_updated.start.astype(int) - gencode_df_updated["end"] = gencode_df_updated.end.astype(int) + return gene_df, arrow_df - # Adding length to all features: - gencode_df_updated = gencode_df_updated.assign( - length=lambda row: row["end"] - row["start"] + def process_gencode_data(self: FetchGencode) -> None: + """Process the raw GENCODE data into structured gene annotations.""" + logger.info("Parsing GTF annotation.") + gencode_df = self.parse_gtf_annotations(self.gencode_raw) + + gencode_df = self.strip_gene_id_version(gencode_df) + + gencode_df = self.filter_protein_coding_genes(gencode_df) + protein_coding_gene_count = len( + gencode_df.loc[gencode_df.type == "gene"] ) + logger.info(f"Number of protein coding genes: {protein_coding_gene_count:,}") + + gencode_df = self.add_length_column(gencode_df) # Initialize empty dataframes: processed = pd.DataFrame( @@ -198,57 +307,13 @@ def process_gencode_data(self: FetchGencode) -> None: "Generate exon/intron annotations for the canonical transcripts for each gene... 
(it will take a while.)" ) - for (gene_id,), features in gencode_df_updated.groupby(["gene_id"]): - # Selecting protein coding transcript identifiers: - transcripts = features.loc[ - (features.type == "transcript") - & (features.transcript_type == "protein_coding") - ] - - # If no protein coding transcript is found, we skip gene: - if len(transcripts) == 0: + for (gene_id,), features in gencode_df.groupby(["gene_id"]): + result = self.process_single_gene(gene_id, features) + if result is None: continue - - # Adding the length of the CDS to all transcripts: - cds_length = transcripts.transcript_id.apply( - lambda t_id: features.loc[ - (features.type == "CDS") & (features.transcript_id == t_id) - ].length.sum() - ) - transcripts.insert(2, "cds_length", cds_length) - - # Get canonical transcript and properties: - canonical_transcript_id = self.get_canonical_transcript(transcripts) - [start, end] = ( - transcripts.loc[ - transcripts.transcript_id == canonical_transcript_id, - ["start", "end"], - ] - .iloc[0] - .tolist() - ) - - # Get data for the arrow plot: - arrow_part = features.loc[ - (features.transcript_id == canonical_transcript_id) - & (features.type.isin(["CDS", "UTR"])), - ["chr", "start", "end", "strand", "type", "gene_id", "gene_name"], - ] - arrow_data = pd.concat([arrow_data, arrow_part]) - - # Generate exon-intron splice: - gene_df = self.generate_exon_intron_structure( - gene_id, - canonical_transcript_id, - start, - end, - features.loc[ - (features.transcript_id == canonical_transcript_id) - & (features.type == "exon") - ], - ) - # appending to existing data: + gene_df, arrow_df = result processed = pd.concat([processed, gene_df]) + arrow_data = pd.concat([arrow_data, arrow_df]) # Saving data: self.processed = processed diff --git a/src/tests/sample_data/gencode_sample.gtf.gz b/src/tests/sample_data/gencode_sample.gtf.gz new file mode 100644 index 0000000000000000000000000000000000000000..7d16694b0e97818919fc2f8a76be668df99ef3cb GIT binary patch 
literal 1458 zcmV;j1x@-NiwFScFN$dZ1MQqqPoz2+hOgaULB^Zil$^>EvZa( z$j-ooL9_n*Qy5SjnH>~xf%%LYW`I)qytH4x7uesQsHGSAi`Wl?Lqk3I6BT&s;6^Uh z#E?M{#xhxDgttaKRmSvwE(7CE1>sy7pHDAOrWXJ5kMW?QWwI3&#^-UcRP&pefie5? zX@7qag-M~&p>aAsKl$-(tiSWZAeLTym=xrN6Lm0Ev&C0+CX$Iz{OZCyd!9rlTVx zT^m0xlrhqEij^O@`q+MOm#+(?KBv)-i_3qp{%WQkls<_gV|o{OX{l$Y=O>38v!t-8 ze2{^3ll}BA*Vku1zpsvnZAu*5v_?lS>K#WiShe#zYyVt8zRh*I39*?j%;axB!9s}} zb}DR9*!6&*=k_5-M!s8Ct&3u{te2%M?-z?Oirv+|qokP9){W(zG13zBz5GKaD%X=i z82Fx? zr%k{q(QHi+PLskx4#z9uAkO+P98w}2OsLHTPg@IwQ&TwlDC@e(IvfPL0o`oW%`%p{ zDKwaTz(N)S{eXVnI$&AUVIo~%7~}pASXNo2qpQv_f=H*q0joqe7<2==+1lYe&9jN% zMD)MIp(Vm0*b>;WKsXI{I7C-GFZX5Fv+)=Kz@qBAk3d^glob`LrybpVya= z2?X7|UpKm0ArT0=d82L$;c-4r^;$PDPPMF?)?7^6u?T0$@jg&KY^x*~LXN$v)@JCM zu2#v-+>c{5adQW~R1z7rbon`CbR#7v@iYqW@1|=^1=W{kQ&1UV7EViR+lTyXIGc+; zb_;!TPGTp}*9#za0)6vdAdv3|`S#pAP(VKv(1IV=3H0@Rm7PG}yrTx>`$4`1PplK@ z>rMK10{xS#rQ4+$R}0AZgZwLB3Zs01^gq-C0c;gO^UhKy$U|0MRw zMRzCb?R!lgdi$;9*Zdjoh%C2f<9eMw(`rIIt$keq=r&#|{p-%v#Ft4sPn0Jg|D=TN z*R}5XiG|Iz16K9!nlj8pC2ytoD7Fp6m~w0rnqF%?$JU%L(k*QDM$xWfYu?iUYVA{N z&yN6lAJbiV5bM1hUb}7TcQWq{Zu8G&k3Uf^Sd-5`F|>IK#~J252e>VmBp4B40@ht+-P)U%0C`uD=ZPzrgzXM%9hwoCgY5+CLWoxPtwfz*4SjeC>sJZU z)su+~)7m$WA;O&xMqb5m6%Mj%VU)92MS+a{2jwzXUo543H@sZxuf9o&;VO4n_m=$j zL0q?IrH52NCGFqP4I&wsNZP!A9K2m%uoTu` ztJ3SPWYc_=(rS0$fJ)l-B0wYq6G@9E254nqT6ukoJS0v)BpXxdxUp1v2SfrrNnMfr M4~uu-lJI>104+($wg3PC literal 0 HcmV?d00001 diff --git a/src/tests/test_fetch_gencode.py b/src/tests/test_fetch_gencode.py new file mode 100644 index 0000000..e835396 --- /dev/null +++ b/src/tests/test_fetch_gencode.py @@ -0,0 +1,478 @@ +"""Tests for the FetchGencode data processing methods.""" + +from __future__ import annotations + +import os +import unittest + +import pandas as pd + +from genome_plotter.input_parsers.fetch_gencode import FetchGencode + +SAMPLE_GTF = os.path.join( + os.path.dirname(__file__), "sample_data", "gencode_sample.gtf.gz" +) + + +class TestParseGtfAnnotations(unittest.TestCase): + """Test cases for 
FetchGencode.parse_gtf_annotations.""" + + def test_basic_parsing(self) -> None: + """Annotation strings are split into separate columns.""" + df = pd.DataFrame( + { + "chr": ["1", "1"], + "type": ["gene", "transcript"], + "start": [100, 100], + "end": [200, 200], + "strand": ["+", "+"], + "annotation": [ + 'gene_id "ENSG00000001.1"; gene_name "ABC"; gene_type "protein_coding"', + 'gene_id "ENSG00000001.1"; gene_name "ABC"; transcript_id "ENST001"', + ], + } + ) + result = FetchGencode.parse_gtf_annotations(df) + + self.assertNotIn("annotation", result.columns) + self.assertIn("gene_id", result.columns) + self.assertIn("gene_name", result.columns) + self.assertEqual(result.iloc[0]["gene_id"], "ENSG00000001.1") + self.assertEqual(result.iloc[0]["gene_name"], "ABC") + self.assertEqual(result.iloc[0]["gene_type"], "protein_coding") + + def test_preserves_coordinate_columns(self) -> None: + """Coordinate columns are preserved after parsing.""" + df = pd.DataFrame( + { + "chr": ["1"], + "type": ["gene"], + "start": [100], + "end": [200], + "strand": ["+"], + "annotation": ['gene_id "ENSG001"; gene_name "X"'], + } + ) + result = FetchGencode.parse_gtf_annotations(df) + for col in ["chr", "type", "start", "end", "strand"]: + self.assertIn(col, result.columns) + + +class TestStripGeneIdVersion(unittest.TestCase): + """Test cases for FetchGencode.strip_gene_id_version.""" + + def test_version_stripped(self) -> None: + """Version suffixes are removed from gene_id.""" + df = pd.DataFrame({"gene_id": ["ENSG00000001.5", "ENSG00000002.12"]}) + result = FetchGencode.strip_gene_id_version(df) + self.assertEqual(result.gene_id.tolist(), ["ENSG00000001", "ENSG00000002"]) + + def test_no_version(self) -> None: + """gene_id without version suffix is unchanged.""" + df = pd.DataFrame({"gene_id": ["ENSG00000001"]}) + result = FetchGencode.strip_gene_id_version(df) + self.assertEqual(result.gene_id.tolist(), ["ENSG00000001"]) + + +class TestFilterProteinCodingGenes(unittest.TestCase): + 
"""Test cases for FetchGencode.filter_protein_coding_genes.""" + + def _make_df(self, chr: str, gene_type: str, gene_name: str) -> pd.DataFrame: + return pd.DataFrame( + { + "chr": [chr], + "gene_type": [gene_type], + "gene_name": [gene_name], + "type": ["gene"], + } + ) + + def test_keeps_valid_row(self) -> None: + """Standard protein-coding gene on conventional chromosome passes.""" + df = self._make_df("1", "protein_coding", "BRCA1") + result = FetchGencode.filter_protein_coding_genes(df) + self.assertEqual(len(result), 1) + + def test_filters_non_conventional_chr(self) -> None: + """Entries on non-conventional chromosomes are removed.""" + df = self._make_df("KI270757.1", "protein_coding", "BRCA1") + result = FetchGencode.filter_protein_coding_genes(df) + self.assertEqual(len(result), 0) + + def test_filters_non_protein_coding(self) -> None: + """Non-protein_coding gene_type entries are removed.""" + df = self._make_df("1", "lncRNA", "XIST") + result = FetchGencode.filter_protein_coding_genes(df) + self.assertEqual(len(result), 0) + + def test_filters_ensg_gene_name(self) -> None: + """Gene names starting with ENSG are removed.""" + df = self._make_df("1", "protein_coding", "ENSG00000123456") + result = FetchGencode.filter_protein_coding_genes(df) + self.assertEqual(len(result), 0) + + def test_filters_orf_gene_name(self) -> None: + """Gene names containing 'orf' are removed.""" + df = self._make_df("1", "protein_coding", "C1orf123") + result = FetchGencode.filter_protein_coding_genes(df) + self.assertEqual(len(result), 0) + + +class TestAddLengthColumn(unittest.TestCase): + """Test cases for FetchGencode.add_length_column.""" + + def test_length_computed(self) -> None: + """Length column is computed as end - start.""" + df = pd.DataFrame({"start": ["100", "200"], "end": ["300", "500"]}) + result = FetchGencode.add_length_column(df) + self.assertEqual(result.length.tolist(), [200, 300]) + + def test_types_cast_to_int(self) -> None: + """Start and end columns 
are cast to int.""" + df = pd.DataFrame({"start": ["100"], "end": ["200"]}) + result = FetchGencode.add_length_column(df) + self.assertTrue(pd.api.types.is_integer_dtype(result.start)) + self.assertTrue(pd.api.types.is_integer_dtype(result.end)) + + def test_does_not_modify_original(self) -> None: + """Original DataFrame is not mutated.""" + df = pd.DataFrame({"start": ["100"], "end": ["200"]}) + FetchGencode.add_length_column(df) + self.assertNotIn("length", df.columns) + + +class TestProcessSingleGene(unittest.TestCase): + """Test cases for FetchGencode.process_single_gene.""" + + def _make_gene_features(self) -> pd.DataFrame: + """Create a minimal multi-feature gene fixture.""" + gene_id = "ENSG00000001" + transcript_id = "ENST00000001" + return pd.DataFrame( + [ + { + "chr": "chr1", + "type": "gene", + "start": 100, + "end": 400, + "strand": "+", + "gene_id": gene_id, + "gene_name": "TESTGENE", + "gene_type": "protein_coding", + "transcript_id": "", + "transcript_type": "", + "length": 300, + "ccdsid": float("nan"), + "havana_transcript": float("nan"), + }, + { + "chr": "chr1", + "type": "transcript", + "start": 100, + "end": 400, + "strand": "+", + "gene_id": gene_id, + "gene_name": "TESTGENE", + "gene_type": "protein_coding", + "transcript_id": transcript_id, + "transcript_type": "protein_coding", + "length": 300, + "ccdsid": float("nan"), + "havana_transcript": float("nan"), + }, + { + "chr": "chr1", + "type": "exon", + "start": 100, + "end": 200, + "strand": "+", + "gene_id": gene_id, + "gene_name": "TESTGENE", + "gene_type": "protein_coding", + "transcript_id": transcript_id, + "transcript_type": "protein_coding", + "length": 100, + "ccdsid": float("nan"), + "havana_transcript": float("nan"), + }, + { + "chr": "chr1", + "type": "exon", + "start": 300, + "end": 400, + "strand": "+", + "gene_id": gene_id, + "gene_name": "TESTGENE", + "gene_type": "protein_coding", + "transcript_id": transcript_id, + "transcript_type": "protein_coding", + "length": 100, + 
"ccdsid": float("nan"), + "havana_transcript": float("nan"), + }, + { + "chr": "chr1", + "type": "CDS", + "start": 120, + "end": 200, + "strand": "+", + "gene_id": gene_id, + "gene_name": "TESTGENE", + "gene_type": "protein_coding", + "transcript_id": transcript_id, + "transcript_type": "protein_coding", + "length": 80, + "ccdsid": float("nan"), + "havana_transcript": float("nan"), + }, + { + "chr": "chr1", + "type": "CDS", + "start": 300, + "end": 380, + "strand": "+", + "gene_id": gene_id, + "gene_name": "TESTGENE", + "gene_type": "protein_coding", + "transcript_id": transcript_id, + "transcript_type": "protein_coding", + "length": 80, + "ccdsid": float("nan"), + "havana_transcript": float("nan"), + }, + { + "chr": "chr1", + "type": "UTR", + "start": 100, + "end": 120, + "strand": "+", + "gene_id": gene_id, + "gene_name": "TESTGENE", + "gene_type": "protein_coding", + "transcript_id": transcript_id, + "transcript_type": "protein_coding", + "length": 20, + "ccdsid": float("nan"), + "havana_transcript": float("nan"), + }, + ] + ) + + def test_returns_tuple(self) -> None: + """Returns a (gene_df, arrow_df) tuple.""" + features = self._make_gene_features() + result = FetchGencode.process_single_gene("ENSG00000001", features) + assert result is not None + gene_df, arrow_df = result + self.assertIsInstance(gene_df, pd.DataFrame) + self.assertIsInstance(arrow_df, pd.DataFrame) + + def test_gene_df_has_exon_intron(self) -> None: + """Gene DataFrame contains both exon and intron features.""" + features = self._make_gene_features() + result = FetchGencode.process_single_gene("ENSG00000001", features) + assert result is not None + gene_df, _ = result + types = gene_df.type.unique().tolist() + self.assertIn("exon", types) + self.assertIn("intron", types) + + def test_arrow_df_has_cds_utr(self) -> None: + """Arrow DataFrame contains CDS and UTR features.""" + features = self._make_gene_features() + result = FetchGencode.process_single_gene("ENSG00000001", features) + assert 
result is not None + _, arrow_df = result + types = arrow_df.type.unique().tolist() + self.assertIn("CDS", types) + self.assertIn("UTR", types) + + def test_returns_none_for_no_protein_coding_transcript(self) -> None: + """Returns None when no protein-coding transcript exists.""" + features = pd.DataFrame( + [ + { + "chr": "chr1", + "type": "transcript", + "start": 100, + "end": 400, + "strand": "+", + "gene_id": "ENSG00000001", + "gene_name": "TESTGENE", + "gene_type": "protein_coding", + "transcript_id": "ENST00000001", + "transcript_type": "lncRNA", + "length": 300, + "ccdsid": float("nan"), + "havana_transcript": float("nan"), + }, + ] + ) + result = FetchGencode.process_single_gene("ENSG00000001", features) + self.assertIsNone(result) + + def test_gene_df_columns(self) -> None: + """Gene DataFrame contains expected columns.""" + features = self._make_gene_features() + result = FetchGencode.process_single_gene("ENSG00000001", features) + assert result is not None + gene_df, _ = result + for col in ["chr", "start", "end", "type", "gene_id", "gene_name", "transcript_id"]: + self.assertIn(col, gene_df.columns) + + +class TestSampleGtfPipeline(unittest.TestCase): + """Integration tests running the sample GTF through the full processing pipeline.""" + + gencode_raw: pd.DataFrame + parsed: pd.DataFrame + stripped: pd.DataFrame + filtered: pd.DataFrame + with_length: pd.DataFrame + + @classmethod + def setUpClass(cls) -> None: + """Load sample GTF once and run through the full pipeline.""" + # Load raw GTF exactly like retrieve_data does (comment='#' skips header lines): + raw_tsv = pd.read_csv( + SAMPLE_GTF, sep="\t", comment="#", header=None, compression="gzip" + ) + cls.gencode_raw = FetchGencode.parse_raw_gtf(raw_tsv) + cls.parsed = FetchGencode.parse_gtf_annotations(cls.gencode_raw) + cls.stripped = FetchGencode.strip_gene_id_version(cls.parsed) + cls.filtered = FetchGencode.filter_protein_coding_genes(cls.stripped) + cls.with_length = 
FetchGencode.add_length_column(cls.filtered) + + # -- parse_raw_gtf --------------------------------------------------- + + def test_raw_shape(self) -> None: + """Raw GTF has 64 rows and the 6 expected columns.""" + self.assertEqual(len(self.gencode_raw), 64) + self.assertEqual( + list(self.gencode_raw.columns), + ["chr", "type", "start", "end", "strand", "annotation"], + ) + + def test_raw_chromosomes(self) -> None: + """All rows are on chr8.""" + self.assertEqual(self.gencode_raw.chr.unique().tolist(), ["chr8"]) + + def test_raw_feature_types(self) -> None: + """Raw data contains expected GTF feature types.""" + types = set(self.gencode_raw.type.unique()) + for expected in ("gene", "transcript", "exon", "CDS", "UTR"): + self.assertIn(expected, types) + + # -- parse_gtf_annotations ------------------------------------------- + + def test_parsed_annotation_columns(self) -> None: + """Parsed DataFrame has gene_id, gene_name, gene_type columns.""" + for col in ("gene_id", "gene_name", "gene_type"): + self.assertIn(col, self.parsed.columns) + + def test_parsed_annotation_dropped(self) -> None: + """Raw annotation column is removed after parsing.""" + self.assertNotIn("annotation", self.parsed.columns) + + def test_parsed_gene_name(self) -> None: + """All rows belong to gene OPRK1.""" + self.assertEqual(self.parsed.gene_name.unique().tolist(), ["OPRK1"]) + + def test_parsed_row_count_unchanged(self) -> None: + """Row count is preserved after annotation parsing.""" + self.assertEqual(len(self.parsed), 64) + + # -- strip_gene_id_version ------------------------------------------- + + def test_stripped_gene_id(self) -> None: + """Version suffix is removed from gene_id.""" + self.assertEqual( + self.stripped.gene_id.unique().tolist(), ["ENSG00000082556"] + ) + + # -- filter_protein_coding_genes ------------------------------------- + + def test_filter_keeps_conventional_chr(self) -> None: + """Rows on chr8 (conventional chromosome) are kept after filtering.""" + 
self.assertGreater(len(self.filtered), 0) + + def test_filter_only_protein_coding(self) -> None: + """All rows after filtering have gene_type protein_coding.""" + self.assertTrue((self.filtered.gene_type == "protein_coding").all()) + + def test_filter_single_gene(self) -> None: + """Only the one gene from the sample survives the filter.""" + gene_count = len(self.filtered.loc[self.filtered.type == "gene"]) + self.assertEqual(gene_count, 1) + + # -- add_length_column ----------------------------------------------- + + def test_add_length_column_present(self) -> None: + """length column is added to the filtered data.""" + self.assertIn("length", self.with_length.columns) + + def test_add_length_values(self) -> None: + """length equals end - start for every row.""" + self.assertTrue( + (self.with_length.length == self.with_length.end - self.with_length.start).all() + ) + + # -- process_single_gene --------------------------------------------- + + def test_process_single_gene_from_sample(self) -> None: + """process_single_gene produces exon/intron and arrow data for OPRK1.""" + groups = list(self.with_length.groupby(["gene_id"])) + self.assertEqual(len(groups), 1) + + (gene_id,), features = groups[0] + self.assertEqual(gene_id, "ENSG00000082556") + + result = FetchGencode.process_single_gene(gene_id, features) + assert result is not None + gene_df, arrow_df = result + + # Gene DataFrame should have exon + intron features: + gene_types = set(gene_df.type.unique()) + self.assertIn("exon", gene_types) + self.assertIn("intron", gene_types) + + # Arrow DataFrame should have CDS + UTR features: + arrow_types = set(arrow_df.type.unique()) + self.assertIn("CDS", arrow_types) + self.assertIn("UTR", arrow_types) + + # All output rows should reference OPRK1: + self.assertTrue((gene_df.gene_name == "OPRK1").all()) + self.assertTrue((arrow_df.gene_name == "OPRK1").all()) + + # Gene id should be consistent: + self.assertTrue((gene_df.gene_id == "ENSG00000082556").all()) + 
self.assertTrue((arrow_df.gene_id == "ENSG00000082556").all()) + + def test_canonical_transcript_selection(self) -> None: + """Canonical transcript for OPRK1 is selected from CCDS transcripts. + + OPRK1 has 3 protein_coding transcripts with CCDS IDs. The one with + the longest CDS should be selected as canonical. + """ + (gene_id,), features = list(self.with_length.groupby(["gene_id"]))[0] + result = FetchGencode.process_single_gene(gene_id, features) + assert result is not None + gene_df, _ = result + + # The canonical transcript should be one of the known OPRK1 transcripts: + transcript_id = gene_df.transcript_id.unique().tolist() + self.assertEqual(len(transcript_id), 1) + self.assertIn( + transcript_id[0], + [ + "ENST00000265572.8", + "ENST00000673285.2", + "ENST00000524278.5", + "ENST00000520287.5", + ], + ) + + +if __name__ == "__main__": + unittest.main() From a11c728901970603853dc83492626e5e5390f615 Mon Sep 17 00:00:00 2001 From: Daniel Suveges Date: Tue, 10 Feb 2026 10:02:03 +0000 Subject: [PATCH 2/4] fix: minor updates --- README.md | 33 ++++--- config.json | 2 +- .../functions/ChromosomePlotter.py | 11 ++- .../functions/CytobandAnnotator.py | 4 + src/genome_plotter/functions/svg_handler.py | 4 +- .../input_parsers/fetch_ensembl.py | 18 ++-- .../input_parsers/fetch_gwas_catalog.py | 1 + src/genome_plotter/prepare_data.py | 96 +++++++++++-------- 8 files changed, 97 insertions(+), 72 deletions(-) diff --git a/README.md b/README.md index 7f6a811..e70ef91 100644 --- a/README.md +++ b/README.md @@ -34,23 +34,23 @@ uv sync --all-extras As input, the most recent datasets are pulled from the respected sources as part of the data preparation process. All source data are mapped to the GRCh38 build of the human genome. -* **The sequence of the human genome** is dowloaded from [Ensembl](http://www.ensembl.org/info/data/ftp/index.html) (checking for the most recent version). 
-* **Genome wide association signals** most recent version of the NHGRI-EBI [GWAS catalog](https://www.ebi.ac.uk/gwas/) (checking the most recent version). -* **Gene annotation** the most recent gene coordinates are downloaded from [GENCODE](http://www.gencodegenes.org/releases/current.html). -* **Canonical transcripts** of protein coding genes are defined according to [Ensembl](https://www.ensembl.org/Help/Glossary). -* **Cytological bands** coordinates fetched from the Ensembl [REST API](http://rest.ensembl.org/). +- **The sequence of the human genome** is dowloaded from [Ensembl](http://www.ensembl.org/info/data/ftp/index.html) (checking for the most recent version). +- **Genome wide association signals** most recent version of the NHGRI-EBI [GWAS catalog](https://www.ebi.ac.uk/gwas/) (checking the most recent version). +- **Gene annotation** the most recent gene coordinates are downloaded from [GENCODE](http://www.gencodegenes.org/releases/current.html). +- **Canonical transcripts** of protein coding genes are defined according to [Ensembl](https://www.ensembl.org/Help/Glossary). +- **Cytological bands** coordinates fetched from the Ensembl [REST API](http://rest.ensembl.org/). More information on the sources can be found in the `config.json` configuration file. ### Step 1 - Pre-processing ```bash -python Prepare_data.py -d data_folder/ -c config.json -s 450 -t 0.5 +uv run Prepare-data -d data_folder/ -c config.json -s 450 -t 0.5 ``` help output: -``` +```text usage: prepare_data.py [-h] -d DATADIR -c CONFIG -s CHUNKSIZE -t TOLERANCE This script fetches and parses input data for the genome plotter project @@ -67,20 +67,20 @@ optional arguments: Fraction of a chunk that cannot be N. ``` -* ** folder into which the files are going to be saved. -* ** JSON file containing the project level configuration. Will be used for multiple scripts -* ** information on the run is saved here. 
-* ** the length of non-overlapping window used to pool together to calculate [GC content](https://en.wikipedia.org/wiki/GC-content). In basepairs. -* ** Ns are discarded from the GC content calculation. This float (ranging from 0-1) shows the maximum of Ns in a chunk tolerated. Chunks with too high N ratio is considered as heterochromatic region on the plot. +- **DATADIR** folder into which the files are going to be saved. +- **CONFIG** JSON file containing the project level configuration. Will be used for multiple scripts +- **LOGFILE** information on the run is saved here. +- **CHUNKSIZE** the length of non-overlapping window used to pool together to calculate [GC content](https://en.wikipedia.org/wiki/GC-content). In basepairs. +- **TOLERANCE** Ns are discarded from the GC content calculation. This float (ranging from 0-1) shows the maximum of Ns in a chunk tolerated. Chunks with too high N ratio is considered as heterochromatic region on the plot. ### Step 2 - Generate chromosome plot ```bash -./plot_chromosome.py --help +uv run plot-chromosome --help ``` -``` +```text usage: plot_chromosome.py [-h] -c CHROMOSOME [-w WIDTH] [-p PIXEL] [-s DARKSTART] [-m DARKMAX] -f FOLDER [--textFile] [-g GENEFILE] [-t TEST] [--dummy] --config CONFIG [-l LOGFILE] @@ -125,8 +125,8 @@ Finally the `.png` file is saved (and `.svg` file if required). The `gene_sets/` folder contains a set of files that can be used as gene annotation: -* *kinases_Hs.bed.gz*: list of kinase genes in the human genome, generated by `get_kinases.sh` script. -* *gene_w_drugs.tsv.gz*: list of genes for which approved drugs exists (where the mechanism of action is known) based on [OpenTargets tractability](https://docs.targetvalidation.org/getting-started/target-tractability) data. +- *kinases_Hs.bed.gz*: list of kinase genes in the human genome, generated by `get_kinases.sh` script. 
+- *gene_w_drugs.tsv.gz*: list of genes for which approved drugs exists (where the mechanism of action is known) based on [OpenTargets tractability](https://docs.targetvalidation.org/getting-started/target-tractability) data. ### Result @@ -134,7 +134,6 @@ The following image was created based on the data of chromosome 20, where 450 bp Chromosome 20 - The combined plot showing the entire genome with all the protein kinases highlighted: Protein kinases in the genome diff --git a/config.json b/config.json index 000d42e..c827402 100644 --- a/config.json +++ b/config.json @@ -66,7 +66,7 @@ "gwas_data": { "host": "ftp.ebi.ac.uk", "path": "/pub/databases/gwas/releases/latest", - "source_file": "gwas-catalog-associations_ontology-annotated.tsv", + "source_file": "gwas-catalog-associations_ontology-annotated-full.zip", "processed_file": "processed_GWAS.bed.gz", "release_date": null } diff --git a/src/genome_plotter/functions/ChromosomePlotter.py b/src/genome_plotter/functions/ChromosomePlotter.py index 6e12577..3b7c390 100644 --- a/src/genome_plotter/functions/ChromosomePlotter.py +++ b/src/genome_plotter/functions/ChromosomePlotter.py @@ -3,6 +3,9 @@ from __future__ import annotations import logging +import os + +os.environ["DYLD_FALLBACK_LIBRARY_PATH"] = "/opt/homebrew/lib" import cairosvg import pandas as pd @@ -62,14 +65,16 @@ def __add_centromere(self: ChromosomePlotter) -> None: # Right mark of the centromere: centromere_string = f'\n' + C 0 {centromere_midpoint}, {cetromere_x / 2} {centromere_midpoint}, {cetromere_x / 2} {centromere_midpoint} \ + C {cetromere_x / 2} {centromere_midpoint}, 0 {centromere_midpoint}, -1 {centromere_hight} Z" fill="white"/>\n' half_centromere = f'\n\t{centromere_string}\n\n' # Generating the other half of the centromoere: - other_half = f'\n\t{half_centromere}\n\n' + ) # adding both sides of the centromere to the plot: self.__plot_string__ += ( diff --git a/src/genome_plotter/functions/CytobandAnnotator.py 
b/src/genome_plotter/functions/CytobandAnnotator.py index a5d1b97..bac7fb4 100644 --- a/src/genome_plotter/functions/CytobandAnnotator.py +++ b/src/genome_plotter/functions/CytobandAnnotator.py @@ -2,7 +2,11 @@ from __future__ import annotations +import os + import cairosvg + +os.environ["DYLD_FALLBACK_LIBRARY_PATH"] = "/opt/homebrew/lib" import pandas as pd diff --git a/src/genome_plotter/functions/svg_handler.py b/src/genome_plotter/functions/svg_handler.py index e0611ab..cb663fb 100644 --- a/src/genome_plotter/functions/svg_handler.py +++ b/src/genome_plotter/functions/svg_handler.py @@ -2,8 +2,10 @@ from __future__ import annotations +import os from typing import Any +os.environ["DYLD_FALLBACK_LIBRARY_PATH"] = "/opt/homebrew/lib" import cairosvg @@ -177,7 +179,7 @@ def draw_line( if kwargs: for key, value in kwargs.items(): - extra_args += f' {key.replace("_","-")}="{value}"' + extra_args += f' {key.replace("_", "-")}="{value}"' print(extra_args) self.__svg__ += self.__svg_line__.format( x1, y1, x2, y2, stroke, stroke_width, extra_args diff --git a/src/genome_plotter/input_parsers/fetch_ensembl.py b/src/genome_plotter/input_parsers/fetch_ensembl.py index d8364a9..775f743 100644 --- a/src/genome_plotter/input_parsers/fetch_ensembl.py +++ b/src/genome_plotter/input_parsers/fetch_ensembl.py @@ -44,9 +44,9 @@ def __init__(self: FetchGenome, ensembl_parameters: SourcePrototype) -> None: Args: ensembl_parameters (SourcePrototype): Parameters to fetch the data. """ - assert ( - ensembl_parameters.release and ensembl_parameters.path - ), "Ensembl release and path needs to be provided." + assert ensembl_parameters.release and ensembl_parameters.path, ( + "Ensembl release and path needs to be provided." 
+ )
 
         # These are the parameters required to fetch the data:
         self.host = ensembl_parameters.host
@@ -93,11 +93,7 @@ def parse_genome(
                 # If there's data in the buffer, save it:
                 if chrom_data:
                     # We are skipping non-canonical chromosomes:
-                    if chrom_name and len(chrom_name) < 3:
-                        logger.info(f"Parsing chromosome {chrom_name} is done.")
-                        self.process_chromosome(chrom_data, chrom_name)
-                    else:
-                        logger.info(f"Chromosome {chrom_name} is skipped.")
+                    self.process_chromosome(chrom_data, chrom_name)
 
                     # Empty chromosome data buffer:
                     chrom_data = ""
@@ -129,6 +125,12 @@ def process_chromosome(
             chrom_data (pd.DataFrame): The chromosome sequence data.
             chr_name (str): The name of the chromosome.
         """
+        # Test chromosome name if we want to process or not:
+        if chr_name is None or len(chr_name) > 3:
+            logger.warning(f"Non-canonical chromosome ({chr_name}) is skipped.")
+            return None
+
+        logger.info(f"Processing chromosome: {chr_name}")
         raw_data = []
         file_name = f"{self.data_folder}/{self.parsed_file.format(chr_name)}"
         chunk_size = self.chunk_size
diff --git a/src/genome_plotter/input_parsers/fetch_gwas_catalog.py b/src/genome_plotter/input_parsers/fetch_gwas_catalog.py
index 849712a..676c94e 100644
--- a/src/genome_plotter/input_parsers/fetch_gwas_catalog.py
+++ b/src/genome_plotter/input_parsers/fetch_gwas_catalog.py
@@ -43,6 +43,7 @@ def retrieve_data(self: FetchGwas) -> None:
         self.release_date = self.fetch_last_update_date(self.path)
 
         # Parse data
+        logger.info(f"{self.path} -> {self.source_file}")
         self.fetch_tsv(self.path, self.source_file)
         logger.info(f"Successfully fetched {len(self.tsv_data):,} GWAS associations.")
 
diff --git a/src/genome_plotter/prepare_data.py b/src/genome_plotter/prepare_data.py
index 3f89734..9661f38 100644
--- a/src/genome_plotter/prepare_data.py
+++ b/src/genome_plotter/prepare_data.py
@@ -10,8 +10,14 @@
 import yaml
 
 from genome_plotter.functions.ConfigManager import Config
+from genome_plotter.input_parsers.data_integrator import integrate_data
 from
genome_plotter.input_parsers.fetch_cytobands import FetchCytobands +from genome_plotter.input_parsers.fetch_ensembl import ( + FetchGenome, + fetch_ensembl_version, +) from genome_plotter.input_parsers.fetch_gencode import FetchGencode +from genome_plotter.input_parsers.fetch_gwas_catalog import FetchGwas logger = logging.getLogger(__name__) @@ -77,6 +83,9 @@ def run(configuration: Config) -> None: Args: configuration (Config): The configuration object containing the input data. + + Raises: + ValueError: if Could not find ensembl version. """ # Extracting relevant parameters: basic_parameters = configuration.basic_parameters @@ -92,20 +101,20 @@ def run(configuration: Config) -> None: logger.info(f"Chunk size: {chunk_size}") logger.info(f"Tolerance for unsequenced bases: {tolerance}") - # # Fetching GWAS Catalog data: - # logger.info("Fetching GWAS data...") - # gwas_retrieve = FetchGwas(configuration.source_data.gwas_data) - # gwas_retrieve.retrieve_data() - # gwas_retrieve.process_gwas_data() - # gwas_retrieve.save_gwas_data(data_dir) - # configuration.source_data.gwas_data.release_date = gwas_retrieve.get_release_date() - - # # Fetching cytological bands: - # logger.info("Fetching cytoband information...") - # configuration.source_data.cytoband_data.genome_build = get_cytoband_data( - # configuration.source_data.cytoband_data.url, - # f"{data_dir}/{configuration.source_data.cytoband_data.processed_file}", - # ) + # Fetching GWAS Catalog data: + logger.info("Fetching GWAS data...") + gwas_retrieve = FetchGwas(configuration.source_data.gwas_data) + gwas_retrieve.retrieve_data() + gwas_retrieve.process_gwas_data() + gwas_retrieve.save_gwas_data(data_dir) + configuration.source_data.gwas_data.release_date = gwas_retrieve.get_release_date() + + # Fetching cytological bands: + logger.info("Fetching cytoband information...") + configuration.source_data.cytoband_data.genome_build = get_cytoband_data( + configuration.source_data.cytoband_data.url, + 
f"{data_dir}/{configuration.source_data.cytoband_data.processed_file}",
+    )
 
     # Fetching GENCODE data:
     logging.info("Fetching GENCODE data.")
@@ -118,34 +127,37 @@ def run(configuration: Config) -> None:
     )
     configuration.source_data.gencode_data.version = gencode_retrieve.get_release()
 
-    # # Fetching Ensembl version and genome build:
-    # logger.info("Fetching Ensembl release...")
-    # ensembl_release = fetch_ensembl_version(
-    #     configuration.source_data.ensembl_data.version_url
-    # )
-    # configuration.source_data.ensembl_data.release = ensembl_release
-    # logger.info(f"Current Ensembl release: {ensembl_release}")
-
-    # # Fetching the human genome:
-    # logger.info("Fetching the human genome sequence...")
-    # genome_retrieve = FetchGenome(configuration.source_data.ensembl_data)
-    # genome_retrieve.retrieve_data()
-    # genome_retrieve.parse_genome(chunk_size, tolerance, data_dir)
-
-    # # Integrate parsed data into one single table:
-    # logger.info("Integrating parsed data...")
-    # integrate_data(
-    #     output_dir=data_dir,
-    #     # chromosomes=genome_retrieve.chromosomes,
-    #     chromosomes=["19", "21", "13"],
-    #     cytoband_file=configuration.get_cytoband_file(),
-    #     gencode_file=configuration.get_gencode_file(),
-    # )
-
-    # # Save config file:
-    # updated_config_file = "config_updated.json"
-    # logger.info(f"Saving updated configuration as {updated_config_file}.")
-    # configuration.save(updated_config_file)
+    # Fetching Ensembl version and genome build:
+    logger.info("Fetching Ensembl release...")
+    if configuration.source_data.ensembl_data.version_url is None:
+        raise ValueError("Could not pull Ensembl version.")
+
+    ensembl_release = fetch_ensembl_version(
+        configuration.source_data.ensembl_data.version_url
+    )
+    configuration.source_data.ensembl_data.release = ensembl_release
+    logger.info(f"Current Ensembl release: {ensembl_release}")
+
+    # Fetching the human genome:
+    logger.info("Fetching the human genome sequence...")
+    genome_retrieve =
FetchGenome(configuration.source_data.ensembl_data)
+    genome_retrieve.retrieve_data()
+    genome_retrieve.parse_genome(chunk_size, tolerance, data_dir)
+
+    # Integrate parsed data into one single table:
+    logger.info("Integrating parsed data...")
+    integrate_data(
+        output_dir=data_dir,
+        # chromosomes=genome_retrieve.chromosomes,
+        chromosomes=[str(i + 1) for i in range(22)] + ["Y", "X", "MT"],
+        cytoband_file=configuration.get_cytoband_file(),
+        gencode_file=configuration.get_gencode_file(),
+    )
+
+    # Save config file:
+    updated_config_file = "config_updated.json"
+    logger.info(f"Saving updated configuration as {updated_config_file}.")
+    configuration.save(updated_config_file)
 
 
 def validate_input(data_dir: str, config_file: str) -> None:

From d2ce804161858b86c31321ea481a62417454aea7 Mon Sep 17 00:00:00 2001
From: Daniel Suveges <daniel.suveges@protonmail.com>
Date: Tue, 10 Feb 2026 10:19:26 +0000
Subject: [PATCH 3/4] making config optional. If not provided the default
 configuration is used

---
 config.json => src/genome_plotter/assets/config.json | 0
 src/genome_plotter/prepare_data.py                   | 10 ++++++++--
 2 files changed, 8 insertions(+), 2 deletions(-)
 rename config.json => src/genome_plotter/assets/config.json (100%)

diff --git a/config.json b/src/genome_plotter/assets/config.json
similarity index 100%
rename from config.json
rename to src/genome_plotter/assets/config.json
diff --git a/src/genome_plotter/prepare_data.py b/src/genome_plotter/prepare_data.py
index 9661f38..5256aea 100644
--- a/src/genome_plotter/prepare_data.py
+++ b/src/genome_plotter/prepare_data.py
@@ -41,8 +41,9 @@ def parse_args() -> argparse.Namespace:
     parser.add_argument(
         "-c",
         "--config",
-        help="JSON file with configuration data",
-        required=True,
+        help="JSON file with configuration data.
If not provided, the default config bundled with the package is used.", + required=False, + default=None, type=str, ) parser.add_argument( @@ -192,6 +193,11 @@ def main() -> None: logging.config.dictConfig(logger_config) logger = logging.getLogger(__name__) + # Resolve config file: use bundled default if not provided: + if args.config is None: + args.config = os.path.join(os.path.dirname(__file__), "assets", "config.json") + logger.info("No config file provided, using default bundled config.") + # Validate input parameters: logger.info("Validating input parameters...") validate_input(args.dataDir, args.config) From 221c12ba8ffec707b5e7913584a799c3dc47b865 Mon Sep 17 00:00:00 2001 From: Daniel Suveges Date: Tue, 10 Feb 2026 10:21:33 +0000 Subject: [PATCH 4/4] update tests --- src/tests/test_config_manager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tests/test_config_manager.py b/src/tests/test_config_manager.py index 58fcc86..039780e 100644 --- a/src/tests/test_config_manager.py +++ b/src/tests/test_config_manager.py @@ -14,7 +14,7 @@ class TestConfigManager(unittest.TestCase): """Test cases for Config manager.""" CONFIG_JSON = os.path.join( - os.path.dirname(__file__), "..", "genome_plotter", "config.json" + os.path.dirname(__file__), "..", "genome_plotter", "assets", "config.json" ) with open(CONFIG_JSON) as f: