diff --git a/Examples/qspace_lanczos/compute_spectral_Xpoint.py b/Examples/qspace_lanczos/compute_spectral_Xpoint.py
new file mode 100644
index 00000000..c46ad7dc
--- /dev/null
+++ b/Examples/qspace_lanczos/compute_spectral_Xpoint.py
@@ -0,0 +1,195 @@
+"""
+Q-Space Lanczos: phonon spectral function at the X point of SnTe
+================================================================
+
+This example computes the anharmonic phonon spectral function for SnTe
+at the X point (zone boundary) using the Q-space Lanczos algorithm.
+
+The Q-space Lanczos exploits Bloch momentum conservation to drastically
+reduce the size of the two-phonon sector, giving a speedup proportional
+to the number of unit cells in the supercell (N_cell = 8 for this 2x2x2
+supercell).
+
+Requirements:
+    - Julia with SparseArrays package
+    - spglib
+    - The SnTe ensemble from tests/test_julia/data/
+
+Usage:
+    python compute_spectral_Xpoint.py
+
+    # Or with MPI parallelism:
+    mpirun -np 4 python compute_spectral_Xpoint.py
+"""
+from __future__ import print_function
+
+import numpy as np
+import os, sys
+
+import cellconstructor as CC
+import cellconstructor.Phonons
+import cellconstructor.Units
+
+import sscha, sscha.Ensemble
+import tdscha.QSpaceLanczos as QL
+
+from tdscha.Parallel import pprint as print
+
+# ========================
+# Parameters
+# ========================
+# Path to the SnTe ensemble data (2x2x2 supercell, 2 atoms/unit cell)
+DATA_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)),
+                        '..', '..', 'tests', 'test_julia', 'data')
+NQIRR = 3           # Number of irreducible q-points in the dynamical matrix
+TEMPERATURE = 250    # Temperature in Kelvin
+N_STEPS = 50         # Number of Lanczos steps (increase for production)
+SAVE_DIR = "output"  # Directory for checkpoints and results
+
+
+def main():
+    # ========================
+    # 1. Load ensemble
+    # ========================
+    dyn = CC.Phonons.Phonons(os.path.join(DATA_DIR, "dyn_gen_pop1_"), NQIRR)
+    ens = sscha.Ensemble.Ensemble(dyn, TEMPERATURE)
+    ens.load_bin(DATA_DIR, 1)
+
+    # ========================
+    # 2. Create Q-space Lanczos
+    # ========================
+    qlanc = QL.QSpaceLanczos(ens)
+    qlanc.ignore_v3 = False   # Include 3-phonon interactions
+    qlanc.ignore_v4 = False   # Include 4-phonon interactions
+    qlanc.init(use_symmetries=True)
+
+    # ========================
+    # 3. Inspect q-points and pick the X point
+    # ========================
+    print("Available q-points:")
+    for iq, q in enumerate(qlanc.q_points):
+        freqs = qlanc.w_q[:, iq] * CC.Units.RY_TO_CM
+        print("  iq={}: q = ({:8.5f}, {:8.5f}, {:8.5f})  "
+              "freqs = {} cm-1".format(iq, q[0], q[1], q[2],
+              np.array2string(freqs, precision=1, separator=', ')))
+
+    # The zone-boundary X point in a 2x2x2 FCC supercell is at iq=5,6,7
+    # (equivalent by cubic symmetry). We pick iq=5.
+    iq_pert = 5
+    print()
+    print("Selected q-point: iq={}".format(iq_pert))
+    print("  q = {}".format(qlanc.q_points[iq_pert]))
+    print()
+
+    # ========================
+    # 4. Run Lanczos for each band at the X point
+    # ========================
+    n_bands = qlanc.n_bands  # 6 bands for SnTe (3 * 2 atoms)
+
+    for band in range(n_bands):
+        freq = qlanc.w_q[band, iq_pert] * CC.Units.RY_TO_CM
+
+        # Skip acoustic modes (zero frequency at Gamma only; at X all modes
+        # have finite frequency, but we check anyway for safety)
+        if qlanc.w_q[band, iq_pert] < 1e-6:
+            print("Skipping acoustic band {} (freq = {:.2f} cm-1)".format(
+                band, freq))
+            continue
+
+        print("=" * 50)
+        print("Band {}: {:.2f} cm-1".format(band, freq))
+        print("=" * 50)
+
+        qlanc.prepare_mode_q(iq_pert, band)
+        qlanc.run_FT(N_STEPS, save_each=10, save_dir=SAVE_DIR,
+                      prefix="Xpoint_band{}".format(band), verbose=True)
+        qlanc.save_status(os.path.join(SAVE_DIR,
+                          "Xpoint_band{}_final.npz".format(band)))
+
+    # ========================
+    # 5. Plot the total spectral function at X
+    # ========================
+    print()
+    print("=" * 50)
+    print("Computing total spectral function at X point")
+    print("=" * 50)
+
+    # Frequency grid (cm-1)
+    w_cm = np.linspace(0, 200, 500)
+    w_ry = w_cm / CC.Units.RY_TO_CM
+    smearing = 3.0 / CC.Units.RY_TO_CM  # 3 cm-1 broadening
+
+    total_spectral = np.zeros_like(w_cm)
+
+    for band in range(n_bands):
+        if qlanc.w_q[band, iq_pert] < 1e-6:
+            continue
+
+        result_file = os.path.join(SAVE_DIR,
+                                   "Xpoint_band{}_final.npz".format(band))
+        if not os.path.exists(result_file):
+            print("  Band {} result not found, skipping".format(band))
+            continue
+
+        # Load and compute Green function
+        from tdscha.DynamicalLanczos import Lanczos
+        lanc_tmp = Lanczos()
+        lanc_tmp.load_status(result_file)
+
+        gf = lanc_tmp.get_green_function_continued_fraction(
+            w_ry, smearing=smearing, use_terminator=True)
+        spectral = -np.imag(gf)
+        total_spectral += spectral
+
+        # Print the renormalized frequency from G(0)
+        gf0 = lanc_tmp.get_green_function_continued_fraction(
+            np.array([0.0]), smearing=smearing, use_terminator=True)
+        w2 = 1.0 / np.real(gf0[0])
+        w_ren = np.sign(w2) * np.sqrt(np.abs(w2)) * CC.Units.RY_TO_CM
+        print("  Band {}: harmonic = {:.2f} cm-1, "
+              "renormalized = {:.2f} cm-1".format(
+                  band,
+                  qlanc.w_q[band, iq_pert] * CC.Units.RY_TO_CM,
+                  w_ren))
+
+    # Save the spectrum to a text file
+    output_file = os.path.join(SAVE_DIR, "spectral_Xpoint.dat")
+    np.savetxt(output_file,
+               np.column_stack([w_cm, total_spectral]),
+               header="omega(cm-1)  spectral_function(arb.units)",
+               fmt="%.6f")
+    print()
+    print("Spectral function saved to {}".format(output_file))
+
+    # Plot if matplotlib available
+    try:
+        import matplotlib
+        matplotlib.use('Agg')
+        import matplotlib.pyplot as plt
+
+        fig, ax = plt.subplots(figsize=(8, 5))
+        ax.plot(w_cm, total_spectral, 'b-', linewidth=1.5)
+
+        # Mark harmonic frequencies
+        for band in range(n_bands):
+            freq = qlanc.w_q[band, iq_pert] * CC.Units.RY_TO_CM
+            if freq > 1e-3:
+                ax.axvline(freq, color='r', linestyle='--', alpha=0.5,
+                           linewidth=0.8)
+
+        ax.set_xlabel("Frequency (cm$^{-1}$)")
+        ax.set_ylabel("Spectral function (arb. units)")
+        ax.set_title("SnTe phonon spectral function at X point (T = {} K)".format(
+            TEMPERATURE))
+        ax.set_xlim(0, 200)
+        ax.legend(["Anharmonic (TD-SCHA)", "Harmonic frequencies"],
+                  loc="upper right")
+        fig.tight_layout()
+        fig.savefig(os.path.join(SAVE_DIR, "spectral_Xpoint.pdf"))
+        print("Plot saved to {}/spectral_Xpoint.pdf".format(SAVE_DIR))
+    except ImportError:
+        print("matplotlib not available, skipping plot")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/Modules/DynamicalLanczos.py b/Modules/DynamicalLanczos.py
index d5f45f6c..c80718d1 100644
--- a/Modules/DynamicalLanczos.py
+++ b/Modules/DynamicalLanczos.py
@@ -300,6 +300,7 @@ def __init__(self, ensemble = None, mode = None, unwrap_symmetries = False, sele
         self.gamma_only = False
         self.trans_projector = None    # (n_modes, n_modes) matrix P = (1/n_cells) Σ_R T_R^mode
         self.trans_operators = None    # list of (n_modes, n_modes) T_R^mode matrices
+        self.trans_cart_perms = None   # list of Cartesian permutation index arrays for fast projection
 
         # Set to True if we want to use the Wigner equations
         self.use_wigner = use_wigner
@@ -764,15 +765,21 @@ def prepare_symmetrization(self, no_sym = False, verbose = True, symmetries = No
                     len(pg_symmetries), n_total_syms, n_total_syms // max(len(pg_symmetries), 1)))
 
             # Build translation operators in mode space: T_R^mode = pols^T @ P_R @ pols
+            # Also store Cartesian permutation index arrays for fast projection
             self.trans_operators = []
+            self.trans_cart_perms = []
             nat_sc = super_structure.N_atoms
             for t_sym in translations:
                 irt = CC.symmetries.GetIRT(super_structure, t_sym)
                 # Build Cartesian permutation matrix P_R (3*nat_sc x 3*nat_sc)
                 P_R = np.zeros((3 * nat_sc, 3 * nat_sc), dtype=np.double)
+                # Build inverse permutation index array for fast O(n^2) projection
+                inv_perm = np.zeros(3 * nat_sc, dtype=np.intp)
                 for i_at in range(nat_sc):
                     j_at = irt[i_at]
                     P_R[3*j_at:3*j_at+3, 3*i_at:3*i_at+3] = np.eye(3)
+                    inv_perm[3*j_at:3*j_at+3] = [3*i_at, 3*i_at+1, 3*i_at+2]
+                self.trans_cart_perms.append(inv_perm)
                 # Project into mode space
                 T_mode = self.pols.T @ P_R @ self.pols  # (n_modes, n_modes)
                 self.trans_operators.append(T_mode)
@@ -3171,12 +3178,16 @@ def get_combined_proc(start_end):
             # Project f_pert_av: P_trans @ f
             f_pert_av = self.trans_projector @ f_pert_av
 
-            # Project d2v_pert_av: (1/n_cells) Σ_R T_R @ d2v @ T_R^T
-            n_cells = len(self.trans_operators)
-            d2v_proj = np.zeros_like(d2v_pert_av)
-            for T_R in self.trans_operators:
-                d2v_proj += T_R @ d2v_pert_av @ T_R.T
-            d2v_pert_av = d2v_proj / n_cells
+            # Project d2v_pert_av using Cartesian-space permutations (O(n^2) per translation)
+            # Math: (1/N) Σ_R T_R @ d2v @ T_R^T = pols^T @ [(1/N) Σ_R P_R @ (pols @ d2v @ pols^T) @ P_R^T] @ pols
+            # where P_R @ M @ P_R^T is just row/column permutation (fancy indexing)
+            n_cells = len(self.trans_cart_perms)
+            d2v_cart = self.pols @ d2v_pert_av @ self.pols.T
+            d2v_cart_avg = np.zeros_like(d2v_cart)
+            for inv_perm in self.trans_cart_perms:
+                d2v_cart_avg += d2v_cart[np.ix_(inv_perm, inv_perm)]
+            d2v_cart_avg /= n_cells
+            d2v_pert_av = self.pols.T @ d2v_cart_avg @ self.pols
             _t_trans_end = time.time()
             if self.verbose:
                 print("Time for translational projection: {:.6f} s".format(_t_trans_end - _t_trans_start))
diff --git a/Modules/QSpaceLanczos.py b/Modules/QSpaceLanczos.py
new file mode 100644
index 00000000..3f3d5f80
--- /dev/null
+++ b/Modules/QSpaceLanczos.py
@@ -0,0 +1,1267 @@
+"""
+Q-Space Lanczos Module
+======================
+
+This module implements the Lanczos algorithm in q-space (Bloch basis) to exploit
+momentum conservation and block structure from Bloch's theorem. This gives a
+speedup of ~N_cell over the real-space implementation.
+
+Key differences from the real-space DynamicalLanczos.Lanczos:
+- Psi vector is complex128 (Hermitian Lanczos with sesquilinear inner product)
+- Two-phonon sector uses (q1, q2) pairs constrained by q1+q2 = q_pert + G
+- Symmetries are point-group only (translations handled by Fourier transform)
+- Requires Julia extension (tdscha_qspace.jl)
+
+References:
+    Implementation plan: implementation_plan.md
+    Parent class: DynamicalLanczos.py
+"""
+
+from __future__ import print_function
+from __future__ import division
+
+import sys, os
+import time
+import warnings
+import numpy as np
+
+import cellconstructor as CC
+import cellconstructor.Phonons
+import cellconstructor.symmetries
+import cellconstructor.Methods
+
+import tdscha.DynamicalLanczos as DL
+import cellconstructor.Settings as Parallel
+from cellconstructor.Settings import ParallelPrint as print
+
+# Try to import the julia module
+__JULIA_EXT__ = False
+try:
+    import julia, julia.Main
+    julia.Main.include(os.path.join(os.path.dirname(__file__), 
+        "tdscha_qspace.jl"))
+    __JULIA_EXT__ = True
+except:
+    try:
+        import julia
+        from julia.api import Julia
+        jl = Julia(compiled_modules=False)
+        import julia.Main
+        try:
+            julia.Main.include(os.path.join(os.path.dirname(__file__),
+                "tdscha_qspace.jl"))
+            __JULIA_EXT__ = True
+        except:
+            # Install the required modules
+            julia.Main.eval("""
+using Pkg
+Pkg.add("SparseArrays")
+Pkg.add("InteractiveUtils")
+""")
+            try:
+                julia.Main.include(os.path.join(os.path.dirname(__file__),
+                    "tdscha_qspace.jl"))
+                __JULIA_EXT__ = True
+            except Exception as e:
+                warnings.warn("Julia extension not available.\nError: {}".format(e))
+    except Exception as e:
+        warnings.warn("Julia extension not available.\nError: {}".format(e))
+    pass
+
+try:
+    import spglib
+    __SPGLIB__ = True
+except ImportError:
+    __SPGLIB__ = False
+
+
+# Constants
+__EPSILON__ = 1e-12
+__RyToK__ = 157887.32400374097
+TYPE_DP = np.double
+
+
+def find_q_index(q_target, q_points, bg, tol=1e-6):
+    """Find the index of q_target in q_points up to a reciprocal lattice vector.
+
+    Parameters
+    ----------
+    q_target : ndarray(3,)
+        The q-point to find (Cartesian coordinates).
+    q_points : ndarray(n_q, 3)
+        Array of q-points.
+    bg : ndarray(3, 3)
+        Reciprocal lattice vectors / (2*pi), rows are vectors.
+    tol : float
+        Tolerance for matching.
+
+    Returns
+    -------
+    int
+        Index of the matching q-point.
+    """
+    for iq, q in enumerate(q_points):
+        dist = CC.Methods.get_min_dist_into_cell(bg, q_target, q)
+        if dist < tol:
+            return iq
+    raise ValueError("Could not find q-point {} in the q-point list".format(q_target))
+
+
+class QSpaceLanczos(DL.Lanczos):
+    """Q-space Lanczos for spectral calculations exploiting Bloch's theorem.
+
+    This class works in the q-space mode basis to exploit momentum conservation,
+    reducing the psi vector size by ~N_cell and the anharmonic computation by ~N_cell.
+
+    Only Wigner formalism is supported. Requires Julia extension.
+    """
+
+    def __init__(self, ensemble, **kwargs):
+        """Initialize the Q-Space Lanczos.
+
+        Parameters
+        ----------
+        ensemble : sscha.Ensemble.Ensemble
+            The SSCHA ensemble.
+        **kwargs
+            Additional keyword arguments passed to the parent Lanczos class.
+        """
+        # Force Wigner and Julia mode
+        kwargs['mode'] = DL.MODE_FAST_JULIA
+        kwargs['use_wigner'] = True
+
+        self.ensemble = ensemble
+        super().__init__(ensemble, unwrap_symmetries=False, **kwargs)
+
+        if not __JULIA_EXT__:
+            raise ImportError(
+                "QSpaceLanczos requires Julia. Install with: pip install julia"
+            )
+        self.use_wigner = True
+
+        # -- Add the q-space attributes --
+        qspace_attrs = [
+            'q_points', 'n_q', 'n_bands', 'w_q', 'pols_q',
+            'acoustic_eps', 'X_q', 'Y_q',
+            'iq_pert', 'q_pair_map', 'unique_pairs',
+            '_psi_size', '_block_offsets_a', '_block_offsets_b', '_block_sizes',
+            '_qspace_sym_data', '_qspace_sym_q_map', 'n_syms_qspace',
+        ]
+        self.__total_attributes__.extend(qspace_attrs)
+
+        # == 1. Get q-space eigenmodes ==
+        ws_sc, pols_sc, w_q, pols_q = self.dyn.DiagonalizeSupercell(return_qmodes=True)
+
+        self.q_points = np.array(self.dyn.q_tot)  # (n_q, 3)
+        self.n_q = len(self.q_points)
+        self.n_bands = 3 * self.uci_structure.N_atoms  # uniform band count
+        self.w_q = w_q        # (n_bands, n_q) from DiagonalizeSupercell
+        self.pols_q = pols_q  # (3*n_at, n_bands, n_q) complex eigenvectors
+
+        # The masses needs to be restricted to the primitive cell only
+        self.m = self.m[:self.n_bands]
+
+        # Small frequency threshold for acoustic mode masking
+        self.acoustic_eps = 1e-6
+
+        # == 2. Bloch transform ensemble data ==
+        self._bloch_transform_ensemble()
+
+        # Q-space specific state (set by build_q_pair_map)
+        self.iq_pert = None
+        self.q_pair_map = None
+        self.unique_pairs = None
+        self._psi_size = None
+        self._block_offsets_a = None
+        self._block_offsets_b = None
+        self._block_sizes = None
+
+        # Symmetry data for q-space
+        self._qspace_sym_data = None
+        self._qspace_sym_q_map = None
+        self.n_syms_qspace = 0
+
+    def _bloch_transform_ensemble(self):
+        """Bloch transform the ensemble displacements and forces into q-space mode basis.
+
+        Computes X_q and Y_q from the ensemble u_disps_qspace and forces_qspace.
+        Uses sscha.Ensemble implementation for the Fourier transform via Julia.
+
+        The forces are the anharmonic residual: f - f_SSCHA - <f - f_SSCHA>,
+        matching the preprocessing done in DynamicalLanczos.__init__.
+        """
+        # Ensure the ensemble has computed q-space quantities
+        # Check if fourier_gradient is active or force it
+        if self.ensemble.u_disps_qspace is None:
+             # Force fourier gradient initialization in the ensemble
+             if not self.ensemble.fourier_gradient:
+                 print("Ensemble checking: computing Fourier transform of displacements and forces...")
+                 self.ensemble.fourier_gradient = True
+             self.ensemble.init()
+
+        # Unit conversion factors
+        # Target: Bohr (u) and Ry/Bohr (f)
+        u_conv = 1.0
+        f_conv = 1.0
+        if self.ensemble.units == "default":
+            u_conv = CC.Units.A_TO_BOHR
+            f_conv = 1.0 / CC.Units.A_TO_BOHR
+        elif self.ensemble.units == "hartree":
+            f_conv = 2.0 # Ha -> Ry
+
+        # Mass scaling factors (sqrt(m) for u, 1/sqrt(m) for f)
+        # We use self.dyn.structure corresponding to unit cell
+        m_uc = self.dyn.structure.get_masses_array()
+        sqrt_m = np.sqrt(m_uc)
+        sqrt_m_3 = np.repeat(sqrt_m, 3) # (3*nat_uc,)
+
+        # Compute the average anharmonic force (matching parent DynamicalLanczos)
+        # get_average_forces returns rho-weighted <f - f_SSCHA> in unit cell, Ry/Angstrom
+        f_mean_uc = self.ensemble.get_average_forces(get_error=False)  # (nat_uc, 3)
+        # Symmetrize the average force
+        qe_sym = CC.symmetries.QE_Symmetry(self.dyn.structure)
+        qe_sym.SetupQPoint()
+        qe_sym.SymmetrizeVector(f_mean_uc)
+        f_mean_flat = f_mean_uc.ravel()  # (3*nat_uc,) in Ry/Angstrom
+
+        # Fourier transform of the constant (tiled) average force:
+        # At Gamma: f_mean_q[Gamma] = sqrt(n_cell) * f_mean_uc
+        # At q != Gamma: f_mean_q[q] = 0
+        n_cell = np.prod(self.dyn.GetSupercell())
+        f_mean_q_gamma = np.sqrt(n_cell) * f_mean_flat  # (3*nat_uc,) complex
+
+        # Allocate q-space arrays
+        self.X_q = np.zeros((self.n_q, self.N, self.n_bands), dtype=np.complex128)
+        self.Y_q = np.zeros((self.n_q, self.N, self.n_bands), dtype=np.complex128)
+
+        # Projection loop
+        for iq in range(self.n_q):
+            # Retrieve ensemble q-space data (N, 3*nat_uc)
+            # Apply conversions and mass scaling
+            u_tilde_q = self.ensemble.u_disps_qspace[:, :, iq] * (u_conv * sqrt_m_3[None, :])
+
+            # Anharmonic force residual: f - f_SSCHA (both in Ry/Angstrom in q-space)
+            delta_f_q = (self.ensemble.forces_qspace[:, :, iq]
+                         - self.ensemble.sscha_forces_qspace[:, :, iq])
+
+            # Subtract the average force (only non-zero at Gamma)
+            if iq == 0:
+                delta_f_q = delta_f_q - f_mean_q_gamma[None, :]
+
+            # Convert to Ry/Bohr and mass-scale
+            f_tilde_q = delta_f_q * (f_conv / sqrt_m_3[None, :])
+
+            # Project onto eigenvector basis: X_q[iq, config, nu] = sum_a conj(pols_q[a, nu, iq]) * u_tilde_q[config, a]
+            # pols_q shape: (3*nat_uc, n_bands, n_q)
+            pol_iq = self.pols_q[:, :, iq]  # (3*nat_uc, n_bands)
+
+            self.X_q[iq, :, :] = u_tilde_q @ np.conj(pol_iq)
+            self.Y_q[iq, :, :] = f_tilde_q @ np.conj(pol_iq)
+
+    def build_q_pair_map(self, iq_pert):
+        """Find all (iq1, iq2) pairs satisfying q1 + q2 = q_pert + G.
+
+        Parameters
+        ----------
+        iq_pert : int
+            Index of the perturbation q-point.
+        """
+        bg = self.uci_structure.get_reciprocal_vectors() / (2 * np.pi)
+        q_pert = self.q_points[iq_pert]
+
+        self.iq_pert = iq_pert
+        self.q_pair_map = np.zeros(self.n_q, dtype=np.int32)
+
+        for iq1 in range(self.n_q):
+            q_target = q_pert - self.q_points[iq1]
+            found = False
+            for iq2 in range(self.n_q):
+                if CC.Methods.get_min_dist_into_cell(bg, q_target, self.q_points[iq2]) < 1e-6:
+                    self.q_pair_map[iq1] = iq2
+                    found = True
+                    break
+            if not found:
+                raise ValueError(
+                    "Could not find partner for q1={} with q_pert={}".format(
+                        self.q_points[iq1], q_pert))
+
+        # Unique pairs: iq1 <= iq2 (avoids double-counting)
+        self.unique_pairs = []
+        for iq1 in range(self.n_q):
+            iq2 = self.q_pair_map[iq1]
+            if iq1 <= iq2:
+                self.unique_pairs.append((iq1, iq2))
+
+        # Pre-compute block layout
+        self._compute_block_layout()
+
+    def _compute_block_layout(self):
+        """Pre-compute the block offsets and sizes for the psi vector."""
+        nb = self.n_bands
+        n_pairs = len(self.unique_pairs)
+
+        # Compute block sizes
+        self._block_sizes = []
+        for iq1, iq2 in self.unique_pairs:
+            if iq1 < iq2:
+                self._block_sizes.append(nb * nb)  # full block
+            else:  # iq1 == iq2
+                self._block_sizes.append(nb * (nb + 1) // 2)  # upper triangle
+
+        # R sector: n_bands entries
+        r_size = nb
+
+        # a' sector offsets
+        self._block_offsets_a = []
+        offset = r_size
+        for size in self._block_sizes:
+            self._block_offsets_a.append(offset)
+            offset += size
+
+        # b' sector offsets
+        self._block_offsets_b = []
+        for size in self._block_sizes:
+            self._block_offsets_b.append(offset)
+            offset += size
+
+        self._psi_size = offset
+
+    def get_psi_size(self):
+        """Return the total size of the psi vector."""
+        if self._psi_size is None:
+            raise ValueError("Must call build_q_pair_map first")
+        return self._psi_size
+
+    def get_block_offset(self, pair_idx, sector='a'):
+        """Get the offset into psi for a given pair index.
+
+        Parameters
+        ----------
+        pair_idx : int
+            Index into self.unique_pairs.
+        sector : str
+            'a' for a' sector, 'b' for b' sector.
+        """
+        if sector == 'a':
+            return self._block_offsets_a[pair_idx]
+        else:
+            return self._block_offsets_b[pair_idx]
+
+    def get_block_size(self, pair_idx):
+        """Get the number of entries for this pair."""
+        return self._block_sizes[pair_idx]
+
+    def get_R1_q(self):
+        """Extract R^(1) from psi (n_bands complex entries at q_pert)."""
+        return self.psi[:self.n_bands].copy()
+
+    def _unpack_upper_triangle(self, flat_data, n):
+        """Unpack upper triangle storage into a full (n, n) matrix.
+
+        Storage order: for i in range(n): M[i, i:] stored contiguously.
+        Off-diagonal: M[j, i] = conj(M[i, j]) for Hermitian matrix.
+        """
+        mat = np.zeros((n, n), dtype=np.complex128)
+        idx = 0
+        for i in range(n):
+            length = n - i
+            mat[i, i:] = flat_data[idx:idx + length]
+            idx += length
+        # Fill lower triangle (Hermitian)
+        for i in range(n):
+            mat[i + 1:, i] = np.conj(mat[i, i + 1:])
+        return mat
+
+    def _pack_upper_triangle(self, mat, n):
+        """Pack a full (n, n) matrix into upper triangle storage."""
+        size = n * (n + 1) // 2
+        flat = np.zeros(size, dtype=np.complex128)
+        idx = 0
+        for i in range(n):
+            length = n - i
+            flat[idx:idx + length] = mat[i, i:]
+            idx += length
+        return flat
+
+    def get_block(self, pair_idx, sector='a'):
+        """Reconstruct full (n_bands, n_bands) matrix from psi storage.
+
+        Parameters
+        ----------
+        pair_idx : int
+            Index into self.unique_pairs.
+        sector : str
+            'a' or 'b'.
+
+        Returns
+        -------
+        ndarray(n_bands, n_bands), complex128
+        """
+        nb = self.n_bands
+        iq1, iq2 = self.unique_pairs[pair_idx]
+        offset = self.get_block_offset(pair_idx, sector)
+        size = self.get_block_size(pair_idx)
+
+        raw = self.psi[offset:offset + size]
+
+        if iq1 < iq2:
+            # Full block, row-major
+            return raw.reshape(nb, nb).copy()
+        else:
+            # Upper triangle storage
+            return self._unpack_upper_triangle(raw, nb)
+
+    def get_a1_block(self, pair_idx):
+        """Get the a'(1) block for pair_idx."""
+        return self.get_block(pair_idx, 'a')
+
+    def get_b1_block(self, pair_idx):
+        """Get the b'(1) block for pair_idx."""
+        return self.get_block(pair_idx, 'b')
+
+    def set_block_in_psi(self, pair_idx, matrix, sector, target_psi):
+        """Write a (n_bands, n_bands) block into the target psi vector.
+
+        Parameters
+        ----------
+        pair_idx : int
+        matrix : ndarray(n_bands, n_bands)
+        sector : str ('a' or 'b')
+        target_psi : ndarray — the psi vector to write into
+        """
+        nb = self.n_bands
+        iq1, iq2 = self.unique_pairs[pair_idx]
+        offset = self.get_block_offset(pair_idx, sector)
+
+        if iq1 < iq2:
+            # Full block
+            target_psi[offset:offset + nb * nb] = matrix.ravel()
+        else:
+            # Upper triangle
+            target_psi[offset:offset + self.get_block_size(pair_idx)] = \
+                self._pack_upper_triangle(matrix, nb)
+
+    # ====================================================================
+    # Step 4: Mask for Hermitian inner product
+    # ====================================================================
+    def mask_dot_wigner(self, debug=False):
+        """Build the mask for Hermitian inner product with upper-triangle storage.
+
+        For full blocks (iq1 < iq2): factor 2 for the conjugate block (iq2, iq1).
+        For diagonal blocks (iq1 == iq2): off-diagonal factor 2, diagonal factor 1.
+
+        Returns
+        -------
+        ndarray(psi_size,), float64
+        """
+        mask = np.ones(self.get_psi_size(), dtype=np.float64)
+
+        for pair_idx, (iq1, iq2) in enumerate(self.unique_pairs):
+            offset_a = self.get_block_offset(pair_idx, 'a')
+            offset_b = self.get_block_offset(pair_idx, 'b')
+            size = self.get_block_size(pair_idx)
+
+            if iq1 < iq2:
+                # Full block: factor 2 for the conjugate block
+                mask[offset_a:offset_a + size] = 2
+                mask[offset_b:offset_b + size] = 2
+            else:  # iq1 == iq2, upper triangle storage
+                nb = self.n_bands
+                idx_a = offset_a
+                idx_b = offset_b
+                for i in range(nb):
+                    # Diagonal element: factor 1
+                    mask[idx_a] = 1
+                    mask[idx_b] = 1
+                    idx_a += 1
+                    idx_b += 1
+                    # Off-diagonal elements: factor 2
+                    for j in range(i + 1, nb):
+                        mask[idx_a] = 2
+                        mask[idx_b] = 2
+                        idx_a += 1
+                        idx_b += 1
+
+        return mask
+
+    # ====================================================================
+    # Step 5: Harmonic operator
+    # ====================================================================
+    def apply_L1_FT(self, transpose=False):
+        """Apply the harmonic part of L in q-space (Wigner formalism).
+
+        L_harm is block-diagonal:
+          R sector: -(w_q_pert[nu])^2 * R[nu]
+          a' sector: -(w1 - w2)^2 * a'
+          b' sector: -(w1 + w2)^2 * b'
+
+        Returns
+        -------
+        ndarray(psi_size,), complex128
+        """
+        out = np.zeros(self.get_psi_size(), dtype=np.complex128)
+
+        if self.ignore_harmonic:
+            return out
+
+        # R sector
+        w_qp = self.w_q[:, self.iq_pert]  # (n_bands,)
+        out[:self.n_bands] = -(w_qp ** 2) * self.psi[:self.n_bands]
+
+        # a' and b' sectors
+        for pair_idx, (iq1, iq2) in enumerate(self.unique_pairs):
+            w1 = self.w_q[:, iq1]  # (n_bands,)
+            w2 = self.w_q[:, iq2]  # (n_bands,)
+            a_block = self.get_a1_block(pair_idx)
+            b_block = self.get_b1_block(pair_idx)
+
+            w_minus2 = np.subtract.outer(w1, w2) ** 2  # (n_bands, n_bands)
+            w_plus2 = np.add.outer(w1, w2) ** 2
+
+            self.set_block_in_psi(pair_idx, -w_minus2 * a_block, 'a', out)
+            self.set_block_in_psi(pair_idx, -w_plus2 * b_block, 'b', out)
+
+        return out
+
+    # ====================================================================
+    # Step 6: Anharmonic operator
+    # ====================================================================
+    def _safe_bose_and_mask(self, iq):
+        """Compute Bose-Einstein occupation for bands at iq, masking acoustic modes.
+
+        Returns
+        -------
+        n : ndarray(n_bands,)
+            Bose-Einstein occupation; 0 for acoustic modes.
+        valid : ndarray(n_bands,), bool
+            True for non-acoustic bands.
+        """
+        w = self.w_q[:, iq]
+        valid = w > self.acoustic_eps
+        n = np.zeros_like(w)
+        if self.T > __EPSILON__:
+            n[valid] = 1.0 / (np.exp(w[valid] * __RyToK__ / self.T) - 1.0)
+        return n, valid
+
+    def get_chi_minus_q(self):
+        """Get chi^- for each unique pair as a list of (n_bands, n_bands) matrices.
+
+        chi^-_{nu1, nu2} = (w1 - w2)(n1 - n2) / (2 * w1 * w2)
+        Entries involving acoustic modes (w < acoustic_eps) are set to 0.
+        """
+        chi_list = []
+        for iq1, iq2 in self.unique_pairs:
+            w1 = self.w_q[:, iq1]
+            w2 = self.w_q[:, iq2]
+            n1, v1 = self._safe_bose_and_mask(iq1)
+            n2, v2 = self._safe_bose_and_mask(iq2)
+            # Outer mask: both bands must be non-acoustic
+            valid_mask = np.outer(v1, v2)
+
+            w1_mat = np.tile(w1, (self.n_bands, 1)).T
+            w2_mat = np.tile(w2, (self.n_bands, 1))
+            n1_mat = np.tile(n1, (self.n_bands, 1)).T
+            n2_mat = np.tile(n2, (self.n_bands, 1))
+
+            # Safe division: use np.where to avoid 0/0
+            denom = 2.0 * w1_mat * w2_mat
+            chi = np.where(valid_mask,
+                           (w1_mat - w2_mat) * (n1_mat - n2_mat) / np.where(valid_mask, denom, 1.0),
+                           0.0)
+            chi_list.append(chi)
+        return chi_list
+
+    def get_chi_plus_q(self):
+        """Get chi^+ for each unique pair as a list of (n_bands, n_bands) matrices.
+
+        chi^+_{nu1, nu2} = (w1 + w2)(1 + n1 + n2) / (2 * w1 * w2)
+        Entries involving acoustic modes (w < acoustic_eps) are set to 0.
+        """
+        chi_list = []
+        for iq1, iq2 in self.unique_pairs:
+            w1 = self.w_q[:, iq1]
+            w2 = self.w_q[:, iq2]
+            n1, v1 = self._safe_bose_and_mask(iq1)
+            n2, v2 = self._safe_bose_and_mask(iq2)
+            valid_mask = np.outer(v1, v2)
+
+            w1_mat = np.tile(w1, (self.n_bands, 1)).T
+            w2_mat = np.tile(w2, (self.n_bands, 1))
+            n1_mat = np.tile(n1, (self.n_bands, 1)).T
+            n2_mat = np.tile(n2, (self.n_bands, 1))
+
+            denom = 2.0 * w1_mat * w2_mat
+            chi = np.where(valid_mask,
+                           (w1_mat + w2_mat) * (1 + n1_mat + n2_mat) / np.where(valid_mask, denom, 1.0),
+                           0.0)
+            chi_list.append(chi)
+        return chi_list
+
+    def get_alpha1_beta1_wigner_q(self, get_alpha=True):
+        """Get the perturbation on alpha (Upsilon) from the q-space psi.
+
+        Transforms a'/b' blocks back to the alpha1 perturbation that the
+        Julia code needs.
+
+        alpha1[iq1, iq2] = (w1*w2/X) * [sqrt(-0.5*chi_minus)*a' - sqrt(0.5*chi_plus)*b']
+
+        Returns
+        -------
+        list of ndarray(n_bands, n_bands) — one per unique pair
+        """
+        chi_minus_list = self.get_chi_minus_q()
+        chi_plus_list = self.get_chi_plus_q()
+
+        alpha1_blocks = []
+        for pair_idx, (iq1, iq2) in enumerate(self.unique_pairs):
+            w1 = self.w_q[:, iq1]
+            w2 = self.w_q[:, iq2]
+            n1, v1 = self._safe_bose_and_mask(iq1)
+            n2, v2 = self._safe_bose_and_mask(iq2)
+            valid_mask = np.outer(v1, v2)
+
+            w1_mat = np.tile(w1, (self.n_bands, 1)).T
+            w2_mat = np.tile(w2, (self.n_bands, 1))
+            n1_mat = np.tile(n1, (self.n_bands, 1)).T
+            n2_mat = np.tile(n2, (self.n_bands, 1))
+
+            X = (1 + 2 * n1_mat) * (1 + 2 * n2_mat) / 8
+            # Safe division for w2_on_X: when acoustic, X→∞ and w→0, set to 0
+            w2_on_X = np.where(valid_mask,
+                               (w1_mat * w2_mat) / np.where(valid_mask, X, 1.0),
+                               0.0)
+            chi_minus = chi_minus_list[pair_idx]
+            chi_plus = chi_plus_list[pair_idx]
+
+            a_block = self.get_a1_block(pair_idx)
+            b_block = self.get_b1_block(pair_idx)
+
+            if get_alpha:
+                new_a = w2_on_X * np.sqrt(-0.5 * chi_minus) * a_block
+                new_b = w2_on_X * np.sqrt(+0.5 * chi_plus) * b_block
+                alpha1 = new_a - new_b
+            else:
+                X_safe = np.where(valid_mask, X, 1.0)
+                new_a = np.where(valid_mask,
+                                 (np.sqrt(-0.5 * chi_minus) / X_safe) * a_block,
+                                 0.0)
+                new_b = np.where(valid_mask,
+                                 (np.sqrt(+0.5 * chi_plus) / X_safe) * b_block,
+                                 0.0)
+                alpha1 = new_a + new_b
+
+            alpha1_blocks.append(alpha1)
+        return alpha1_blocks
+
+    def _flatten_blocks(self, blocks):
+        """Flatten a list of (n_bands, n_bands) blocks into a contiguous array."""
+        return np.concatenate([b.ravel() for b in blocks])
+
+    def _unflatten_blocks(self, flat):
+        """Unflatten a contiguous array back into a list of blocks."""
+        nb = self.n_bands
+        blocks = []
+        offset = 0
+        for iq1, iq2 in self.unique_pairs:
+            size = nb * nb
+            blocks.append(flat[offset:offset + size].reshape(nb, nb))
+            offset += size
+        return blocks
+
+    def apply_anharmonic_FT(self, transpose=False, **kwargs):
+        """Apply the anharmonic part of L in q-space (Wigner formalism).
+
+        Calls the Julia q-space extension to compute the perturbed averages,
+        then assembles the output psi vector.
+
+        Returns
+        -------
+        ndarray(psi_size,), complex128
+        """
+        # If both D3 and D4 are ignored, return zero
+        if self.ignore_v3 and self.ignore_v4:
+            return np.zeros(self.get_psi_size(), dtype=np.complex128)
+
+        import julia.Main
+
+        R1 = self.get_R1_q()
+        # If D3 is ignored, zero out R1 so that D3 weight is zero
+        if self.ignore_v3:
+            R1 = np.zeros_like(R1)
+        alpha1_blocks = self.get_alpha1_beta1_wigner_q(get_alpha=True)
+        alpha1_flat = self._flatten_blocks(alpha1_blocks)
+
+        # Call Julia
+        f_pert, d2v_blocks = self._call_julia_qspace(R1, alpha1_flat)
+
+        # Build output psi
+        final_psi = np.zeros(self.get_psi_size(), dtype=np.complex128)
+
+        # R sector
+        final_psi[:self.n_bands] = f_pert
+
+        # a'/b' sectors
+        chi_minus_list = self.get_chi_minus_q()
+        chi_plus_list = self.get_chi_plus_q()
+
+        for pair_idx, (iq1, iq2) in enumerate(self.unique_pairs):
+            d2v_block = d2v_blocks[pair_idx]
+            pert_a = np.sqrt(-0.5 * chi_minus_list[pair_idx]) * d2v_block
+            pert_b = -np.sqrt(+0.5 * chi_plus_list[pair_idx]) * d2v_block
+            self.set_block_in_psi(pair_idx, pert_a, 'a', final_psi)
+            self.set_block_in_psi(pair_idx, pert_b, 'b', final_psi)
+
+        return final_psi
+
+    def _call_julia_qspace(self, R1, alpha1_flat):
+        """Call the Julia q-space extension with MPI parallelization.
+
+        Same MPI pattern as DynamicalLanczos.apply_anharmonic_FT.
+
+        Returns
+        -------
+        f_pert : ndarray(n_bands,), complex128
+        d2v_blocks : list of ndarray(n_bands, n_bands), complex128
+        """
+        import julia.Main
+
+        n_total = self.n_syms_qspace * self.N
+        n_processors = Parallel.GetNProc()
+
+        count = n_total // n_processors
+        remainer = n_total % n_processors
+        indices = []
+        for rank in range(n_processors):
+            if rank < remainer:
+                start = np.int64(rank * (count + 1))
+                stop = np.int64(start + count + 1)
+            else:
+                start = np.int64(rank * count + remainer)
+                stop = np.int64(start + count)
+            indices.append([start + 1, stop])  # 1-indexed for Julia
+
+        unique_pairs_arr = np.array(self.unique_pairs, dtype=np.int32) + 1  # 1-indexed
+
+        def get_combined(start_end):
+            return julia.Main.get_perturb_averages_qspace(
+                self.X_q, self.Y_q, self.w_q, self.rho,
+                R1, alpha1_flat,
+                np.float64(self.T), bool(not self.ignore_v4),
+                np.int64(self.iq_pert + 1),
+                self.q_pair_map + 1,  # 1-indexed
+                unique_pairs_arr,
+                int(start_end[0]), int(start_end[1])
+            )
+
+        combined = Parallel.GoParallel(get_combined, indices, "+")
+        f_pert = combined[:self.n_bands]
+        d2v_flat = combined[self.n_bands:]
+        d2v_blocks = self._unflatten_blocks(d2v_flat)
+        return f_pert, d2v_blocks
+
+    # ====================================================================
+    # Step 5+6: Combined L application (override apply_full_L)
+    # ====================================================================
+    def apply_full_L(self, target=None, force_t_0=False, force_FT=True,
+                     transpose=False, fast_lanczos=True):
+        """Apply the full L operator in q-space.
+
+        Parameters
+        ----------
+        target : ndarray or None
+            If provided, copy into self.psi first.
+        transpose : bool
+            Not used for Hermitian Lanczos.
+
+        Returns
+        -------
+        ndarray(psi_size,), complex128
+        """
+        if target is not None:
+            self.psi = target.copy()
+
+        result = self.apply_L1_FT(transpose=transpose)
+        result += self.apply_anharmonic_FT(transpose=transpose)
+
+        return result
+
+    # ====================================================================
+    # Step 7: Hermitian Lanczos (run_FT override)
+    # ====================================================================
+    def run_FT(self, n_iter, save_dir=None, save_each=5, verbose=True,
+               n_rep_orth=0, n_ortho=10, flush_output=True, debug=False,
+               prefix="LANCZOS", run_simm=None, optimized=False):
+        """Run the Hermitian Lanczos algorithm for q-space.
+
+        This is the same structure as the parent run_FT but with:
+        1. Forced run_simm = True (Hermitian)
+        2. Hermitian dot products: psi.conj().dot(psi * mask).real
+        3. Complex128 psi
+        4. Real coefficients (guaranteed by Hermitian L)
+        """
+        self.verbose = verbose
+
+        if not self.initialized:
+            if verbose:
+                print('Not initialized. Now we symmetrize\n')
+            self.prepare_symmetrization()
+
+        ERROR_MSG = """
+Error, you must initialize a perturbation to start the Lanczos.
+Use prepare_mode_q or prepare_perturbation_q before calling run_FT.
+"""
+        if self.psi is None:
+            raise ValueError(ERROR_MSG)
+
+        mask_dot = self.mask_dot_wigner(debug)
+        psi_norm = np.real(np.conj(self.psi).dot(self.psi * mask_dot))
+        if np.isnan(psi_norm) or psi_norm == 0:
+            raise ValueError(ERROR_MSG)
+
+        # Force symmetric Lanczos
+        run_simm = True
+
+        if Parallel.am_i_the_master():
+            if save_dir is not None:
+                if not os.path.exists(save_dir):
+                    os.makedirs(save_dir)
+
+        if verbose:
+            print('Running the Hermitian Lanczos in q-space')
+            print()
+
+        # Get current step
+        i_step = len(self.a_coeffs)
+
+        if verbose:
+            header = """
+<=====================================>
+|                                     |
+|     Q-SPACE LANCZOS ALGORITHM       |
+|                                     |
+<=====================================>
+
+Starting the algorithm.
+Starting from step %d
+""" % i_step
+            print(header)
+
+        # Initialize
+        if i_step == 0:
+            self.basis_Q = []
+            self.basis_P = []
+            self.s_norm = []
+            norm = np.sqrt(np.real(np.conj(self.psi).dot(self.psi * mask_dot)))
+            first_vector = self.psi / norm
+            self.basis_Q.append(first_vector)
+            self.basis_P.append(first_vector)
+            self.s_norm.append(1)
+        else:
+            if verbose:
+                print('Restarting the Lanczos')
+            self.basis_Q = list(self.basis_Q)
+            self.basis_P = list(self.basis_P)
+            self.s_norm = list(self.s_norm)
+            self.a_coeffs = list(self.a_coeffs)
+            self.b_coeffs = list(self.b_coeffs)
+            self.c_coeffs = list(self.c_coeffs)
+
+        psi_q = self.basis_Q[-1]
+        psi_p = self.basis_P[-1]
+
+        next_converged = False
+        converged = False
+
+        for i in range(i_step, i_step + n_iter):
+            if verbose:
+                print("\n ===== NEW STEP %d =====\n" % (i + 1))
+                if flush_output:
+                    sys.stdout.flush()
+
+            # Apply L (Hermitian => p_L = L_q)
+            t1 = time.time()
+            L_q = self.apply_full_L(psi_q)
+            p_L = np.copy(L_q)
+            t2 = time.time()
+
+            # p normalization
+            c_old = 1
+            if len(self.c_coeffs) > 0:
+                c_old = self.c_coeffs[-1]
+            p_norm = self.s_norm[-1] / c_old
+
+            # a coefficient (real for Hermitian L)
+            a_coeff = np.real(np.conj(psi_p).dot(L_q * mask_dot)) * p_norm
+
+            if np.isnan(a_coeff):
+                raise ValueError("Invalid value in Lanczos. Check frequencies/initialization.")
+
+            # Residuals
+            rk = L_q - a_coeff * psi_q
+            if len(self.basis_Q) > 1:
+                rk -= self.c_coeffs[-1] * self.basis_Q[-2]
+
+            sk = p_L - a_coeff * psi_p
+            if len(self.basis_P) > 1:
+                old_p_norm = self.s_norm[-2]
+                if len(self.c_coeffs) >= 2:
+                    old_p_norm = self.s_norm[-2] / self.c_coeffs[-2]
+                sk -= self.b_coeffs[-1] * self.basis_P[-2] * (old_p_norm / p_norm)
+
+            # s_norm
+            s_norm = np.sqrt(np.real(np.conj(sk).dot(sk * mask_dot)))
+            sk_tilde = sk / s_norm
+            s_norm *= p_norm
+
+            # b and c coefficients (real, should be equal for Hermitian L)
+            b_coeff = np.sqrt(np.real(np.conj(rk).dot(rk * mask_dot)))
+            c_coeff = np.real(np.conj(sk_tilde).dot((rk / b_coeff) * mask_dot)) * s_norm
+
+            self.a_coeffs.append(a_coeff)
+
+            if np.abs(b_coeff) < __EPSILON__ or next_converged:
+                if verbose:
+                    print("Converged (b = {})".format(b_coeff))
+                converged = True
+                break
+            if np.abs(c_coeff) < __EPSILON__:
+                if verbose:
+                    print("Converged (c = {})".format(c_coeff))
+                converged = True
+                break
+
+            psi_q = rk / b_coeff
+            psi_p = sk_tilde.copy()
+
+            # Gram-Schmidt
+            new_q = psi_q.copy()
+            new_p = psi_p.copy()
+
+            for k_orth in range(n_rep_orth):
+                start = max(0, len(self.basis_P) - (n_ortho or len(self.basis_P)))
+
+                for j in range(start, len(self.basis_P)):
+                    coeff1 = np.real(np.conj(self.basis_P[j]).dot(new_q * mask_dot))
+                    coeff2 = np.real(np.conj(self.basis_Q[j]).dot(new_p * mask_dot))
+                    new_q -= coeff1 * self.basis_P[j]
+                    new_p -= coeff2 * self.basis_Q[j]
+
+                normq = np.sqrt(np.real(np.conj(new_q).dot(new_q * mask_dot)))
+                if normq < __EPSILON__:
+                    next_converged = True
+                new_q /= normq
+
+                normp = np.real(np.conj(new_p).dot(new_p * mask_dot))
+                if np.abs(normp) < __EPSILON__:
+                    next_converged = True
+                new_p /= normp
+
+                s_norm = c_coeff / np.real(np.conj(new_p).dot(new_q * mask_dot))
+
+            if not converged:
+                self.basis_Q.append(new_q)
+                self.basis_P.append(new_p)
+                psi_q = new_q.copy()
+                psi_p = new_p.copy()
+
+                self.b_coeffs.append(b_coeff)
+                self.c_coeffs.append(c_coeff)
+                self.s_norm.append(s_norm)
+
+            if verbose:
+                print("Time for L application: %d s" % (t2 - t1))
+                print("a_%d = %.8e" % (i, self.a_coeffs[-1]))
+                print("b_%d = %.8e" % (i, self.b_coeffs[-1]))
+                print("c_%d = %.8e" % (i, self.c_coeffs[-1]))
+                print("|b-c| = %.8e" % np.abs(self.b_coeffs[-1] - self.c_coeffs[-1]))
+
+            if save_dir is not None:
+                if (i + 1) % save_each == 0:
+                    self.save_status("%s/%s_STEP%d" % (save_dir, prefix, i + 1))
+
+            if verbose:
+                print("Lanczos step %d completed." % (i + 1))
+
+        if converged and verbose:
+            print("   last a coeff = {}".format(self.a_coeffs[-1]))
+
+    # ====================================================================
+    # Step 8: Perturbation setup
+    # ====================================================================
+    def prepare_mode_q(self, iq, band_index):
+        """Prepare perturbation for mode (q, nu).
+
+        Parameters
+        ----------
+        iq : int
+            Index of the q-point.
+        band_index : int
+            Band index (0-based).
+        """
+        self.build_q_pair_map(iq)
+        self.reset_q()
+        self.psi[band_index] = 1.0 + 0j
+        self.perturbation_modulus = 1.0
+
+    def prepare_ir(self, effective_charges = None, pol_vec = np.array([1.0, 0.0, 0.0])):
+        """
+        PREPARE LANCZOS FOR INFRARED SPECTRUM COMPUTATION
+        =================================================
+
+        In this subroutine we prepare the lanczos algorithm for the computation of the
+        infrared spectrum signal.
+
+        Parameters
+        ----------
+            effective_charges : ndarray(size = (n_atoms, 3, 3), dtype = np.double)
+                The effective charges. Indices are: Number of atoms in the unit cell,
+                electric field component, atomic coordinate. If None, the effective charges
+                contained in the dynamical matrix will be considered.
+            pol_vec : ndarray(size = 3)
+                The polarization vector of the light.
+        """
+
+        ec = self.dyn.effective_charges
+        if not effective_charges is None:
+            ec = effective_charges
+
+        assert not ec is None, "Error, no effective charge found. Cannot initialize IR responce"
+
+        z_eff = np.einsum("abc, b", ec, pol_vec)
+
+        # Get the gamma effective charge
+        new_zeff = z_eff.ravel() / np.sqrt(self.m)
+
+        # This is a Gamma perturbation
+        self.prepare_perturbation_q(0, new_zeff)
+
+    def prepare_perturbation_q(self, iq, vector):
+        """Prepare perturbation at q from a real-space vector (3*n_at_uc,).
+
+        Projects the vector onto q-space eigenmodes at iq.
+
+        Parameters
+        ----------
+        iq : int
+            Index of the q-point.
+        vector : ndarray(3*n_at_uc,)
+            Perturbation vector in Cartesian real space.
+        """
+        self.build_q_pair_map(iq)
+        self.reset_q()
+        m = np.tile(self.uci_structure.get_masses_array(), (3, 1)).T.ravel()
+        v_scaled = vector / np.sqrt(m)
+        R1 = np.conj(self.pols_q[:, :, iq]).T @ v_scaled  # (n_bands,) complex
+        self.psi[:self.n_bands] = R1
+        self.perturbation_modulus = np.real(np.conj(R1) @ R1)
+
+    def reset_q(self):
+        """Reset the Lanczos state for q-space."""
+        n = self.get_psi_size()
+        self.psi = np.zeros(n, dtype=np.complex128)
+
+        self.eigvals = None
+        self.eigvects = None
+
+        self.a_coeffs = []
+        self.b_coeffs = []
+        self.c_coeffs = []
+        self.krilov_basis = []
+        self.basis_P = []
+        self.basis_Q = []
+        self.s_norm = []
+
+    # ====================================================================
+    # Step 11: Q-space symmetry matrix construction
+    # ====================================================================
+    def prepare_symmetrization(self, no_sym=False, verbose=True, symmetries=None):
+        """Build q-space symmetry matrices and cache them in Julia.
+
+        Overrides the parent to build sparse complex symmetry matrices
+        in the q-space mode basis.
+
+        Uses spglib on the unit cell (not supercell) to get correct
+        fractional-coordinate rotations and translations, then converts
+        to Cartesian for the representation matrices.
+        """
+        self.initialized = True
+
+        if no_sym:
+            # Identity only
+            self.n_syms_qspace = 1
+            n_total = self.n_q * self.n_bands
+            # Build identity sparse matrix
+            import julia.Main
+            julia.Main.eval("""
+            function init_identity_qspace(n_total::Int64)
+                I_sparse = SparseArrays.sparse(
+                    Int32.(1:n_total), Int32.(1:n_total),
+                    ComplexF64.(ones(n_total)), n_total, n_total)
+                _cached_qspace_symmetries[] = [I_sparse]
+                return nothing
+            end
+            """)
+            julia.Main.init_identity_qspace(np.int64(n_total))
+            return
+
+        if not __SPGLIB__:
+            raise ImportError("spglib required for symmetrization")
+
+        # Get symmetries from the UNIT CELL directly via spglib.
+        # spglib returns rotations and translations in fractional
+        # (crystal) coordinates. We convert rotations to Cartesian via
+        # R_cart = M @ R_frac @ M^{-1} where M = unit_cell.T.
+        spg_data = spglib.get_symmetry(self.uci_structure.get_spglib_cell())
+        rot_frac_all = spg_data['rotations']   # (n_sym, 3, 3) integer
+        trans_frac_all = spg_data['translations']  # (n_sym, 3) fractional
+
+        M = self.uci_structure.unit_cell.T       # columns = lattice vectors
+        Minv = np.linalg.inv(M)
+
+        # Extract unique point-group rotations (keep first occurrence)
+        unique_pg = {}
+        for i in range(len(rot_frac_all)):
+            key = rot_frac_all[i].tobytes()
+            if key not in unique_pg:
+                unique_pg[key] = i
+        pg_indices = list(unique_pg.values())
+
+        if verbose:
+            print("Q-space: {} PG symmetries from {} total unit cell symmetries".format(
+                len(pg_indices), len(rot_frac_all)))
+
+        self._build_qspace_symmetries(
+            rot_frac_all, trans_frac_all, pg_indices, M, Minv,
+            verbose=verbose)
+
+    @staticmethod
+    def _get_atom_perm(structure, R_cart, t_cart, M, Minv, tol=0.1):
+        """Find atom permutation under symmetry {R|t}.
+
+        Returns irt such that R @ tau[kappa] + t ≡ tau[irt[kappa]] mod lattice.
+        """
+        nat = structure.N_atoms
+        irt = np.zeros(nat, dtype=int)
+        for kappa in range(nat):
+            tau = structure.coords[kappa]
+            mapped = R_cart @ tau + t_cart
+            for kp in range(nat):
+                diff = mapped - structure.coords[kp]
+                diff_frac = Minv @ diff
+                diff_frac -= np.round(diff_frac)
+                if np.linalg.norm(M @ diff_frac) < tol:
+                    irt[kappa] = kp
+                    break
+        return irt
+
+    def _build_qspace_symmetries(self, rot_frac_all, trans_frac_all,
+                                  pg_indices, M, Minv, verbose=True):
+        """Build sparse complex symmetry matrices for q-space modes.
+
+        For each PG symmetry {R|t}:
+          - Maps q -> Rq (permutes q-points)
+          - Rotates bands within each q-block via
+            D_{nu',nu}(iq'<-iq) = conj(pols_q[:,nu',iq']).T @ P_uc(q') @ pols_q[:,nu,iq]
+          - P_uc includes Cartesian rotation, atom permutation, and Bloch phase:
+            P_uc[3*kp:3*kp+3, 3*k:3*k+3] = exp(-2*pi*i * q' . L_k) * R_cart
+            where L_k = R_cart @ tau_k + t_cart - tau_kp is a lattice vector.
+        """
+        import julia.Main
+
+        nat_uc = self.uci_structure.N_atoms
+        bg = self.uci_structure.get_reciprocal_vectors() / (2 * np.pi)
+        n_total = self.n_q * self.n_bands
+        nb = self.n_bands
+
+        n_syms = len(pg_indices)
+        self.n_syms_qspace = n_syms
+
+        # Build all sparse matrices in Python, then pass to Julia
+        all_rows = []
+        all_cols = []
+        all_vals = []
+
+        for i_sym_idx in pg_indices:
+            R_frac = rot_frac_all[i_sym_idx].astype(float)
+            t_frac = trans_frac_all[i_sym_idx]
+
+            # Convert rotation and translation to Cartesian
+            R_cart = M @ R_frac @ Minv
+            t_cart = M @ t_frac
+
+            # Get atom permutation
+            irt = self._get_atom_perm(
+                self.uci_structure, R_cart, t_cart, M, Minv)
+
+            rows, cols, vals = [], [], []
+
+            for iq in range(self.n_q):
+                q = self.q_points[iq]
+                Rq = R_cart @ q
+
+                # Find iq' matching Rq
+                iq_prime = find_q_index(Rq, self.q_points, bg)
+                q_prime = self.q_points[iq_prime]
+
+                # Build P_uc with Bloch phase factor
+                P_uc = np.zeros((3 * nat_uc, 3 * nat_uc), dtype=np.complex128)
+                for kappa in range(nat_uc):
+                    kp = irt[kappa]
+                    tau_k = self.uci_structure.coords[kappa]
+                    tau_kp = self.uci_structure.coords[kp]
+                    # L is the lattice vector: R@tau + t - tau'
+                    L = R_cart @ tau_k + t_cart - tau_kp
+                    phase = np.exp(-2j * np.pi * q_prime @ L)
+                    P_uc[3 * kp:3 * kp + 3,
+                         3 * kappa:3 * kappa + 3] = phase * R_cart
+
+                # D block: representation matrix in eigenvector basis
+                D = np.conj(self.pols_q[:, :, iq_prime]).T @ P_uc @ self.pols_q[:, :, iq]
+
+                # Add to sparse entries
+                for nu1 in range(nb):
+                    for nu2 in range(nb):
+                        if abs(D[nu1, nu2]) > 1e-12:
+                            rows.append(iq_prime * nb + nu1)
+                            cols.append(iq * nb + nu2)
+                            vals.append(D[nu1, nu2])
+
+            all_rows.append(np.array(rows, dtype=np.int32))
+            all_cols.append(np.array(cols, dtype=np.int32))
+            all_vals.append(np.array(vals, dtype=np.complex128))
+
+        # Pass to Julia for caching (convert to 1-indexed)
+        for i in range(n_syms):
+            all_rows[i] += 1
+            all_cols[i] += 1
+
+        julia.Main.eval("""
+        function init_sparse_symmetries_qspace(
+            all_rows::Vector{Vector{Int32}},
+            all_cols::Vector{Vector{Int32}},
+            all_vals::Vector{Vector{ComplexF64}},
+            n_total::Int64
+        )
+            n_syms = length(all_rows)
+            mats = Vector{SparseMatrixCSC{ComplexF64,Int32}}(undef, n_syms)
+            for i in 1:n_syms
+                mats[i] = sparse(
+                    all_rows[i], all_cols[i], all_vals[i], n_total, n_total)
+            end
+            _cached_qspace_symmetries[] = mats
+            return nothing
+        end
+        """)
+
+        julia.Main.init_sparse_symmetries_qspace(
+            all_rows, all_cols, all_vals, np.int64(n_total))
+
+        if verbose:
+            print("Q-space symmetry matrices ({} x {}), {} symmetries cached in Julia".format(
+                n_total, n_total, n_syms))
+
+    # Override init to use q-space symmetrization
+    def init(self, use_symmetries=True):
+        """Initialize the q-space Lanczos calculation."""
+        self.prepare_symmetrization(no_sym=not use_symmetries)
+        self.initialized = True
diff --git a/Modules/__init__.py b/Modules/__init__.py
index 0f624e55..e85abf36 100644
--- a/Modules/__init__.py
+++ b/Modules/__init__.py
@@ -5,4 +5,4 @@
 """
 
 
-__all__ = ["DynamicalLanczos", "cli"]
+__all__ = ["DynamicalLanczos", "QSpaceLanczos", "cli"]
diff --git a/Modules/tdscha_qspace.jl b/Modules/tdscha_qspace.jl
new file mode 100644
index 00000000..e52580b7
--- /dev/null
+++ b/Modules/tdscha_qspace.jl
@@ -0,0 +1,529 @@
+# Q-Space Lanczos Julia Extension
+# ================================
+#
+# This module implements the q-space perturbed average calculations
+# in Julia for performance. The q-space version exploits Bloch's
+# theorem so that translations are handled via Fourier transform,
+# and only point-group symmetries remain.
+#
+# The q-space ensemble data X_q and Y_q are complex arrays indexed
+# as (n_q, n_configs, n_bands).
+#
+# Three separate functions mirror the real-space tdscha_core.jl:
+#   get_d2v_from_R_pert_qspace  — D3 contribution to d2v
+#   get_d2v_from_Y_pert_qspace  — D4 contribution to d2v
+#   get_f_from_Y_pert_qspace    — D4 contribution to f_pert
+
+using SparseArrays
+using LinearAlgebra
+using LinearAlgebra.BLAS
+
+if !isdefined(@__MODULE__, :RY_TO_K_Q)
+    const RY_TO_K_Q = 157887.32400374097
+end
+
+# Global cache for q-space symmetry matrices (complex)
+if !isdefined(@__MODULE__, :_cached_qspace_symmetries)
+    const _cached_qspace_symmetries = Ref{Union{Nothing,Vector{SparseMatrixCSC{ComplexF64,Int32}}}}(nothing)
+end
+
+
+"""
+    get_d2v_from_R_pert_qspace(...)
+
+D3 contribution to d2v from R^(1) perturbation.
+Mirrors get_d2v_dR2_from_R_pert_sym_fast in tdscha_core.jl.
+
+For each (config, sym):
+  weight_R  = sum_nu f_Y[nu,iq_pert] * x_rot[iq_pert,nu] * conj(R1[nu]) * rho/3
+  weight_Rf = sum_nu conj(R1[nu]) * y_rot[iq_pert,nu] * rho/3
+
+  For each pair p = (q1,q2):
+    d2v[p] += -weight_R  * (r1_q1 * conj(r2_q2)^T + r2_q1 * conj(r1_q2)^T)
+    d2v[p] += -weight_Rf * r1_q1 * conj(r1_q2)^T
+
+where r1 = f_Y * x, r2 = y.
+"""
+function get_d2v_from_R_pert_qspace(
+    X_q::Array{ComplexF64,3},
+    Y_q::Array{ComplexF64,3},
+    f_Y::Matrix{Float64},
+    rho::Vector{Float64},
+    R1::Vector{ComplexF64},
+    symmetries::Vector{SparseMatrixCSC{ComplexF64,Int32}},
+    iq_pert::Int64,
+    unique_pairs::Matrix{Int32},
+    n_bands::Int64,
+    n_q::Int64,
+    start_index::Int64,
+    end_index::Int64
+)
+    n_pairs = size(unique_pairs, 1)
+    n_syms = length(symmetries)
+    n_total = n_q * n_bands
+    N_eff = sum(rho)
+
+    # Output
+    d2v_blocks = [zeros(ComplexF64, n_bands, n_bands) for _ in 1:n_pairs]
+
+    # Buffers
+    x_buf = zeros(ComplexF64, n_total)
+    y_buf = zeros(ComplexF64, n_total)
+
+    for bigindex in start_index:end_index
+        i_config = div(bigindex - 1, n_syms) + 1
+        j_sym = mod(bigindex - 1, n_syms) + 1
+
+        # Build combined vector for this config
+        for iq in 1:n_q
+            for nu in 1:n_bands
+                idx = (iq - 1) * n_bands + nu
+                x_buf[idx] = X_q[iq, i_config, nu]
+                y_buf[idx] = Y_q[iq, i_config, nu]
+            end
+        end
+
+        # Apply symmetry
+        x_rot = symmetries[j_sym] * x_buf
+        y_rot = symmetries[j_sym] * y_buf
+
+        # Views at q_pert
+        x_pert = view(x_rot, (iq_pert-1)*n_bands+1:iq_pert*n_bands)
+        y_pert = view(y_rot, (iq_pert-1)*n_bands+1:iq_pert*n_bands)
+
+        # weight_R = sum_nu f_Y[nu,iq_pert] * x_pert[nu] * conj(R1[nu]) * rho/3
+        weight_R = zero(ComplexF64)
+        for nu in 1:n_bands
+            weight_R += f_Y[nu, iq_pert] * x_pert[nu] * conj(R1[nu])
+        end
+        weight_R *= rho[i_config] / 3.0
+
+        # weight_Rf = sum_nu conj(R1[nu]) * y_pert[nu] * rho/3
+        weight_Rf = zero(ComplexF64)
+        for nu in 1:n_bands
+            weight_Rf += conj(R1[nu]) * y_pert[nu]
+        end
+        weight_Rf *= rho[i_config] / 3.0
+
+        # Accumulate d2v for each pair
+        for p in 1:n_pairs
+            iq1 = unique_pairs[p, 1]
+            iq2 = unique_pairs[p, 2]
+
+            x_q1 = view(x_rot, (iq1-1)*n_bands+1:iq1*n_bands)
+            y_q1 = view(y_rot, (iq1-1)*n_bands+1:iq1*n_bands)
+            x_q2 = view(x_rot, (iq2-1)*n_bands+1:iq2*n_bands)
+            y_q2 = view(y_rot, (iq2-1)*n_bands+1:iq2*n_bands)
+
+            for nu1 in 1:n_bands
+                r1_1 = f_Y[nu1, iq1] * x_q1[nu1]  # r1 at q1
+                r2_1 = y_q1[nu1]                     # r2 at q1
+                for nu2 in 1:n_bands
+                    r1_2 = f_Y[nu2, iq2] * x_q2[nu2]  # r1 at q2
+                    r2_2 = y_q2[nu2]                     # r2 at q2
+
+                    # -weight_R * (r1_q1 * conj(r2_q2)^T + r2_q1 * conj(r1_q2)^T)
+                    contrib = -weight_R * (r1_1 * conj(r2_2) + r2_1 * conj(r1_2))
+                    # -weight_Rf * r1_q1 * conj(r1_q2)^T
+                    contrib -= weight_Rf * r1_1 * conj(r1_2)
+
+                    d2v_blocks[p][nu1, nu2] += contrib
+                end
+            end
+        end
+    end
+
+    # Normalize
+    norm_factor = n_syms * N_eff
+    for p in 1:n_pairs
+        d2v_blocks[p] ./= norm_factor
+    end
+
+    return d2v_blocks
+end
+
+
+"""
+    get_d2v_from_Y_pert_qspace(...)
+
+D4 contribution to d2v from alpha1 (Y1/Upsilon) perturbation.
+Mirrors get_d2v_dR2_from_Y_pert_sym_fast in tdscha_core.jl.
+
+CRITICAL: weights are TOTAL (summed over ALL pairs), not per-pair.
+
+For each (config, sym):
+  1. Compute buffer_u(q,nu) = sum_nu2 alpha1(q, q_pert-q)[nu,nu2] * x(q_pert-q, nu2)
+  2. Compute total_wD4 = -sum_all_pairs x^H * alpha1 * x, scaled by rho/8
+  3. Compute total_wb = -sum_{q,nu} conj(buffer_u) * f_psi * y, scaled by rho/4
+  4. Apply TOTAL weights to ALL d2v blocks
+"""
+function get_d2v_from_Y_pert_qspace(
+    X_q::Array{ComplexF64,3},
+    Y_q::Array{ComplexF64,3},
+    f_Y::Matrix{Float64},
+    f_psi::Matrix{Float64},
+    rho::Vector{Float64},
+    alpha1_blocks::Vector{Matrix{ComplexF64}},
+    symmetries::Vector{SparseMatrixCSC{ComplexF64,Int32}},
+    iq_pert::Int64,
+    unique_pairs::Matrix{Int32},
+    n_bands::Int64,
+    n_q::Int64,
+    start_index::Int64,
+    end_index::Int64
+)
+    n_pairs = size(unique_pairs, 1)
+    n_syms = length(symmetries)
+    n_total = n_q * n_bands
+    N_eff = sum(rho)
+
+    # Output
+    d2v_blocks = [zeros(ComplexF64, n_bands, n_bands) for _ in 1:n_pairs]
+
+    # Buffers
+    x_buf = zeros(ComplexF64, n_total)
+    y_buf = zeros(ComplexF64, n_total)
+    buffer_u = zeros(ComplexF64, n_q, n_bands)
+
+    for bigindex in start_index:end_index
+        i_config = div(bigindex - 1, n_syms) + 1
+        j_sym = mod(bigindex - 1, n_syms) + 1
+
+        # Build combined vector
+        for iq in 1:n_q
+            for nu in 1:n_bands
+                idx = (iq - 1) * n_bands + nu
+                x_buf[idx] = X_q[iq, i_config, nu]
+                y_buf[idx] = Y_q[iq, i_config, nu]
+            end
+        end
+
+        # Apply symmetry
+        x_rot = symmetries[j_sym] * x_buf
+        y_rot = symmetries[j_sym] * y_buf
+
+        # Step 1: Compute buffer_u and total_wD4
+        # buffer_u[iq1, nu1] = sum_nu2 alpha1[p][nu1, nu2] * x_rot[iq2, nu2]
+        # where (iq1, iq2) is the pair containing iq1
+        # total_wD4 = -sum_pairs (multiplicity) * x_q1^H * alpha1 * x_q2
+        total_wD4 = zero(ComplexF64)
+        fill!(buffer_u, zero(ComplexF64))
+
+        for p in 1:n_pairs
+            iq1 = unique_pairs[p, 1]
+            iq2 = unique_pairs[p, 2]
+
+            x_q1 = view(x_rot, (iq1-1)*n_bands+1:iq1*n_bands)
+            x_q2 = view(x_rot, (iq2-1)*n_bands+1:iq2*n_bands)
+
+            # buffer_u at iq1: sum_nu2 alpha1[p][nu1, nu2] * x_q2[nu2]
+            for nu1 in 1:n_bands
+                for nu2 in 1:n_bands
+                    buffer_u[iq1, nu1] += alpha1_blocks[p][nu1, nu2] * x_q2[nu2]
+                end
+            end
+
+            # If iq1 != iq2, also accumulate buffer_u at iq2
+            # buffer_u at iq2: sum_nu1 alpha1[p][nu1, nu2]^H * x_q1[nu1]
+            # = sum_nu1 conj(alpha1[p][nu2, nu1]) * x_q1[nu1] if alpha1 is not Hermitian
+            # Actually: alpha1[p] connects (iq1, iq2). For the reverse pair (iq2, iq1),
+            # the alpha1 would be alpha1[p]^T (since alpha1 is symmetric in the real-space version).
+            # In q-space with Hermitian Upsilon: alpha1(q2,q1) = alpha1(q1,q2)^T
+            if iq1 != iq2
+                for nu2 in 1:n_bands
+                    for nu1 in 1:n_bands
+                        buffer_u[iq2, nu2] += alpha1_blocks[p][nu1, nu2] * x_q1[nu1]
+                    end
+                end
+            end
+
+            # Total weight: conj(x_q1)^T * alpha1 * x_q2
+            local_w = zero(ComplexF64)
+            for nu1 in 1:n_bands
+                for nu2 in 1:n_bands
+                    local_w += conj(x_q1[nu1]) * alpha1_blocks[p][nu1, nu2] * x_q2[nu2]
+                end
+            end
+            # Multiplicity: 2 if iq1 < iq2 (conjugate pair counted), 1 if iq1 == iq2
+            mult = (iq1 < iq2) ? 2 : 1
+            total_wD4 += mult * local_w
+        end
+        total_wD4 *= -rho[i_config] / 8.0
+
+        # Step 2: Compute total_wb = -sum_{q,nu} conj(buffer_u[q,nu]) * f_psi[nu,q] * y_rot[q,nu]
+        total_wb = zero(ComplexF64)
+        for iq in 1:n_q
+            for nu in 1:n_bands
+                y_val = y_rot[(iq-1)*n_bands + nu]
+                total_wb -= conj(buffer_u[iq, nu]) * f_psi[nu, iq] * y_val
+            end
+        end
+        total_wb *= rho[i_config] / 4.0
+
+        # Step 3: Apply TOTAL weights to ALL d2v blocks
+        for p in 1:n_pairs
+            iq1 = unique_pairs[p, 1]
+            iq2 = unique_pairs[p, 2]
+
+            x_q1 = view(x_rot, (iq1-1)*n_bands+1:iq1*n_bands)
+            y_q1 = view(y_rot, (iq1-1)*n_bands+1:iq1*n_bands)
+            x_q2 = view(x_rot, (iq2-1)*n_bands+1:iq2*n_bands)
+            y_q2 = view(y_rot, (iq2-1)*n_bands+1:iq2*n_bands)
+
+            for nu1 in 1:n_bands
+                r1_1 = f_Y[nu1, iq1] * x_q1[nu1]
+                r2_1 = y_q1[nu1]
+                for nu2 in 1:n_bands
+                    r1_2 = f_Y[nu2, iq2] * x_q2[nu2]
+                    r2_2 = y_q2[nu2]
+
+                    # -total_wD4 * (r1*conj(r2)^T + r2*conj(r1)^T)
+                    contrib = -total_wD4 * (r1_1 * conj(r2_2) + r2_1 * conj(r1_2))
+                    # -total_wb * r1*conj(r1)^T
+                    contrib -= total_wb * r1_1 * conj(r1_2)
+
+                    d2v_blocks[p][nu1, nu2] += contrib
+                end
+            end
+        end
+    end
+
+    # Normalize
+    norm_factor = n_syms * N_eff
+    for p in 1:n_pairs
+        d2v_blocks[p] ./= norm_factor
+    end
+
+    return d2v_blocks
+end
+
+
+"""
+    get_f_from_Y_pert_qspace(...)
+
+D4 contribution to f_pert from alpha1 perturbation.
+Mirrors get_f_average_from_Y_pert in tdscha_core.jl.
+
+For each (config, sym):
+  total_sum = sum_pairs (mult) * conj(x_q1)^T * alpha1 * x_q2
+  buffer_u(q,nu) = sum_nu2 alpha1[...] * x(q2,nu2)
+  buf_f_weight = sum_{q,nu} conj(buffer_u) * f_psi * y
+
+  f_pert += (-total_sum/2) * rho/3 * y_rot[q_pert]
+  f_pert += (-buf_f_weight) * rho/3 * f_Y[q_pert] * x_rot[q_pert]
+"""
+function get_f_from_Y_pert_qspace(
+    X_q::Array{ComplexF64,3},
+    Y_q::Array{ComplexF64,3},
+    f_Y::Matrix{Float64},
+    f_psi::Matrix{Float64},
+    rho::Vector{Float64},
+    alpha1_blocks::Vector{Matrix{ComplexF64}},
+    symmetries::Vector{SparseMatrixCSC{ComplexF64,Int32}},
+    iq_pert::Int64,
+    unique_pairs::Matrix{Int32},
+    n_bands::Int64,
+    n_q::Int64,
+    start_index::Int64,
+    end_index::Int64
+)
+    n_pairs = size(unique_pairs, 1)
+    n_syms = length(symmetries)
+    n_total = n_q * n_bands
+    N_eff = sum(rho)
+
+    # Output
+    f_pert = zeros(ComplexF64, n_bands)
+
+    # Buffers
+    x_buf = zeros(ComplexF64, n_total)
+    y_buf = zeros(ComplexF64, n_total)
+    buffer_u = zeros(ComplexF64, n_q, n_bands)
+
+    for bigindex in start_index:end_index
+        i_config = div(bigindex - 1, n_syms) + 1
+        j_sym = mod(bigindex - 1, n_syms) + 1
+
+        # Build combined vector
+        for iq in 1:n_q
+            for nu in 1:n_bands
+                idx = (iq - 1) * n_bands + nu
+                x_buf[idx] = X_q[iq, i_config, nu]
+                y_buf[idx] = Y_q[iq, i_config, nu]
+            end
+        end
+
+        # Apply symmetry
+        x_rot = symmetries[j_sym] * x_buf
+        y_rot = symmetries[j_sym] * y_buf
+
+        # Compute buffer_u and total_sum (same as d2v function)
+        total_sum = zero(ComplexF64)
+        fill!(buffer_u, zero(ComplexF64))
+
+        for p in 1:n_pairs
+            iq1 = unique_pairs[p, 1]
+            iq2 = unique_pairs[p, 2]
+
+            x_q1 = view(x_rot, (iq1-1)*n_bands+1:iq1*n_bands)
+            x_q2 = view(x_rot, (iq2-1)*n_bands+1:iq2*n_bands)
+
+            # buffer_u at iq1
+            for nu1 in 1:n_bands
+                for nu2 in 1:n_bands
+                    buffer_u[iq1, nu1] += alpha1_blocks[p][nu1, nu2] * x_q2[nu2]
+                end
+            end
+
+            # buffer_u at iq2 (if not diagonal pair)
+            if iq1 != iq2
+                for nu2 in 1:n_bands
+                    for nu1 in 1:n_bands
+                        buffer_u[iq2, nu2] += alpha1_blocks[p][nu1, nu2] * x_q1[nu1]
+                    end
+                end
+            end
+
+            # total_sum
+            local_w = zero(ComplexF64)
+            for nu1 in 1:n_bands
+                for nu2 in 1:n_bands
+                    local_w += conj(x_q1[nu1]) * alpha1_blocks[p][nu1, nu2] * x_q2[nu2]
+                end
+            end
+            mult = (iq1 < iq2) ? 2 : 1
+            total_sum += mult * local_w
+        end
+
+        # buf_f_weight = sum_{q,nu} conj(buffer_u[q,nu]) * f_psi[nu,q] * y_rot[q,nu]
+        buf_f_weight = zero(ComplexF64)
+        for iq in 1:n_q
+            for nu in 1:n_bands
+                y_val = y_rot[(iq-1)*n_bands + nu]
+                buf_f_weight += conj(buffer_u[iq, nu]) * f_psi[nu, iq] * y_val
+            end
+        end
+
+        # Two f_pert contributions:
+        # Term 1: (-total_sum/2) * rho/3 * y_rot[q_pert]
+        y_pert = view(y_rot, (iq_pert-1)*n_bands+1:iq_pert*n_bands)
+        x_pert = view(x_rot, (iq_pert-1)*n_bands+1:iq_pert*n_bands)
+
+        w1 = -total_sum / 2.0 * rho[i_config] / 3.0
+        for nu in 1:n_bands
+            f_pert[nu] += w1 * y_pert[nu]
+        end
+
+        # Term 2: (-buf_f_weight) * rho/3 * f_Y[nu,q_pert] * x_rot[q_pert,nu]
+        w2 = -buf_f_weight * rho[i_config] / 3.0
+        for nu in 1:n_bands
+            f_pert[nu] += w2 * f_Y[nu, iq_pert] * x_pert[nu]
+        end
+    end
+
+    # Normalize
+    f_pert ./= (n_syms * N_eff)
+
+    return f_pert
+end
+
+
+"""
+    get_perturb_averages_qspace(...)
+
+Combined entry point called from Python. Computes f_pert and d2v_blocks,
+packs them into a single flat array for MPI reduction.
+
+Returns: [f_pert(n_bands) ; d2v_blocks_flat(n_pairs * n_bands^2)]
+"""
+function get_perturb_averages_qspace(
+    X_q::Array{ComplexF64,3},
+    Y_q::Array{ComplexF64,3},
+    w_q::Matrix{Float64},
+    rho::Vector{Float64},
+    R1::Vector{ComplexF64},
+    alpha1_flat::Vector{ComplexF64},
+    temperature::Float64,
+    apply_v4::Bool,
+    iq_pert::Int64,
+    q_pair_map::Vector{Int32},
+    unique_pairs::Matrix{Int32},
+    start_index::Int64,
+    end_index::Int64
+)
+    n_q = size(X_q, 1)
+    n_bands = size(X_q, 3)
+    n_pairs = size(unique_pairs, 1)
+
+    # Get symmetries
+    symmetries = _cached_qspace_symmetries[]
+    if symmetries === nothing
+        error("Q-space symmetries not initialized. Call init_sparse_symmetries_qspace first.")
+    end
+
+    # Precompute occupation numbers and scaling factors
+    # Acoustic modes (w < threshold) get f_Y=0, f_psi=0 to avoid NaN/Inf
+    f_Y = zeros(Float64, n_bands, n_q)
+    f_psi = zeros(Float64, n_bands, n_q)
+    acoustic_eps = 1e-6
+
+    for iq in 1:n_q
+        for nu in 1:n_bands
+            w = w_q[nu, iq]
+            if w < acoustic_eps
+                f_Y[nu, iq] = 0.0
+                f_psi[nu, iq] = 0.0
+                continue
+            end
+            if temperature > 0
+                nw = 1.0 / (exp(w * RY_TO_K_Q / temperature) - 1.0)
+            else
+                nw = 0.0
+            end
+            f_Y[nu, iq] = 2.0 * w / (1.0 + 2.0 * nw)
+            f_psi[nu, iq] = (1.0 + 2.0 * nw) / (2.0 * w)
+        end
+    end
+
+    # Unpack alpha1 blocks
+    alpha1_blocks = Vector{Matrix{ComplexF64}}(undef, n_pairs)
+    offset = 1
+    for p in 1:n_pairs
+        alpha1_blocks[p] = reshape(alpha1_flat[offset:offset+n_bands^2-1], n_bands, n_bands)
+        offset += n_bands^2
+    end
+
+    # D3 contribution to d2v
+    d2v = get_d2v_from_R_pert_qspace(
+        X_q, Y_q, f_Y, rho, R1, symmetries,
+        iq_pert, unique_pairs, n_bands, n_q,
+        start_index, end_index)
+
+    # D4 contributions
+    f_pert = zeros(ComplexF64, n_bands)
+    if apply_v4
+        d2v_v4 = get_d2v_from_Y_pert_qspace(
+            X_q, Y_q, f_Y, f_psi, rho, alpha1_blocks, symmetries,
+            iq_pert, unique_pairs, n_bands, n_q,
+            start_index, end_index)
+        for p in 1:n_pairs
+            d2v[p] .+= d2v_v4[p]
+        end
+
+        f_pert = get_f_from_Y_pert_qspace(
+            X_q, Y_q, f_Y, f_psi, rho, alpha1_blocks, symmetries,
+            iq_pert, unique_pairs, n_bands, n_q,
+            start_index, end_index)
+    end
+
+    # Pack result: f_pert followed by flattened d2v blocks
+    result = zeros(ComplexF64, n_bands + n_pairs * n_bands^2)
+    result[1:n_bands] = f_pert
+    offset = n_bands + 1
+    for p in 1:n_pairs
+        result[offset:offset+n_bands^2-1] = vec(d2v[p])
+        offset += n_bands^2
+    end
+
+    return result
+end
diff --git a/docs/api/qspace_lanczos.md b/docs/api/qspace_lanczos.md
new file mode 100644
index 00000000..2ec38d82
--- /dev/null
+++ b/docs/api/qspace_lanczos.md
@@ -0,0 +1,35 @@
+# QSpaceLanczos Module
+
+::: tdscha.QSpaceLanczos
+    options:
+      show_root_heading: true
+      show_source: true
+      heading_level: 2
+      members_order: source
+
+## QSpaceLanczos Class
+
+::: tdscha.QSpaceLanczos.QSpaceLanczos
+    options:
+      heading_level: 2
+      show_source: true
+      members_order: source
+      filters: ["!^_", "!^__"]
+
+### Key Differences from Lanczos
+
+| Feature | `Lanczos` (real-space) | `QSpaceLanczos` (q-space) |
+|---------|----------------------|--------------------------|
+| Psi vector | Real (`float64`) | Complex (`complex128`) |
+| Inner product | Standard dot product | Hermitian: $\langle p | q \rangle = \bar{p}^T (q \cdot m)$ |
+| Lanczos type | Bi-conjugate or symmetric | Hermitian symmetric ($b = c$, real coefficients) |
+| Two-phonon pairs | All supercell mode pairs | $(q_1, q_2)$ pairs with $q_1 + q_2 = q_\text{pert}$ |
+| Symmetries | Full space group | Point group only (translations analytic) |
+| Backend | C / MPI / Julia | Julia only |
+
+### Utility Functions
+
+::: tdscha.QSpaceLanczos.find_q_index
+    options:
+      heading_level: 3
+      show_source: true
diff --git a/docs/index.md b/docs/index.md
index d66b4b1b..22ca87d2 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -18,14 +18,16 @@ Part of the SSCHA ecosystem: `cellconstructor` → `python-sscha` → `tdscha`.
 
 6. **API Reference** - Automatically generated documentation:
    - [DynamicalLanczos](api/dynamical_lanczos.md) - Core Lanczos algorithm
+   - [QSpaceLanczos](api/qspace_lanczos.md) - Q-space (Bloch basis) Lanczos algorithm
    - [StaticHessian](api/static_hessian.md) - Free energy Hessian calculations
 
 ## Key Features
 
 - **Simulating the vibrational anharmonic spectra**: IR absorption, Raman scattering, Nuclear inelastic scattering, and phonon spectral functions.
-- **Full quantum treatment of atomic nuclei** 
+- **Full quantum treatment of atomic nuclei**
 - **Parallel execution** with MPI
 - **Symmetry-aware** calculations for efficiency
+- **Q-space Lanczos**: Bloch-basis formulation exploiting momentum conservation for large supercells (see [In-Depth Usage](usage.md#q-space-lanczos))
 
 ## Theoretical Foundation
 
diff --git a/docs/usage.md b/docs/usage.md
index ad8d4bca..0315fac9 100644
--- a/docs/usage.md
+++ b/docs/usage.md
@@ -222,6 +222,158 @@ gf = lanczos.get_green_function_continued_fraction(
 ```
 
 
+## Q-Space Lanczos
+
+The `QSpaceLanczos` module provides a reformulation of the Lanczos algorithm in q-space (Bloch basis), exploiting momentum conservation from Bloch's theorem. This can give a significant speedup over the standard real-space calculation, proportional to the number of unit cells in the supercell ($N_\text{cell}$).
+
+### How It Works
+
+In the standard real-space Lanczos, the two-phonon sector has dimension $\sim n_\text{modes}^2 / 2$, where $n_\text{modes} = 3 N_\text{atoms,sc}$ is the number of supercell modes. For a 4x4x4 supercell of a 2-atom unit cell, this means $\sim 460{,}000$ entries.
+
+By working in q-space, momentum conservation ($\mathbf{q}_1 + \mathbf{q}_2 = \mathbf{q}_\text{pert} + \mathbf{G}$) makes the two-phonon sector **block-diagonal**: only pairs of q-points satisfying the conservation law contribute. The total two-phonon size drops from $\sim n_\text{modes}^2/2$ to $\sim N_\text{cell} \cdot n_\text{bands}^2/2$ (where $n_\text{bands} = 3 N_\text{atoms,uc}$). Moreover, translational symmetries are handled analytically via the Fourier transform, so only point-group symmetries need to be applied explicitly.
+
+### Requirements
+
+- **Julia** must be installed and configured (see [Installation](installation.md))
+- **spglib** for symmetry detection
+- The Q-space Lanczos always uses the **Wigner representation** and **Julia backend**
+
+### Basic Usage
+
+The workflow mirrors the standard Lanczos but uses `QSpaceLanczos` instead of `Lanczos`, and specifies perturbations by q-point index and band index rather than supercell mode index.
+
+```python
+import cellconstructor as CC
+import cellconstructor.Phonons
+import sscha, sscha.Ensemble
+import tdscha.QSpaceLanczos as QL
+
+# Load ensemble (same as standard workflow)
+dyn = CC.Phonons.Phonons("dyn_final_", nqirr=3)
+ens = sscha.Ensemble.Ensemble(dyn, 300)
+ens.load_bin("ensemble_dir", 1)
+
+# Create the Q-space Lanczos object
+qlanc = QL.QSpaceLanczos(ens)
+qlanc.init(use_symmetries=True)
+
+# Perturb a specific phonon mode at q-point 0 (Gamma), band 3
+qlanc.prepare_mode_q(iq=0, band_index=3)
+
+# Run the Lanczos iteration
+qlanc.run_FT(100)
+
+# Extract the Green function (same interface as standard Lanczos)
+import numpy as np
+w = np.linspace(0, 500, 1000) / CC.Units.RY_TO_CM  # frequency grid in Ry
+smearing = 5 / CC.Units.RY_TO_CM
+gf = qlanc.get_green_function_continued_fraction(w, smearing=smearing)
+spectral = -np.imag(gf)
+```
+
+### Choosing the Perturbation
+
+#### Single mode at a q-point
+
+To perturb a single phonon band at a specific q-point:
+
+```python
+qlanc.prepare_mode_q(iq, band_index)
+```
+
+- `iq`: index of the q-point in `qlanc.q_points` (0 = $\Gamma$)
+- `band_index`: band index (0-based, excluding acoustic modes is your responsibility)
+
+You can inspect the available q-points and frequencies:
+
+```python
+# Print q-points (Cartesian, 2pi/alat units)
+for i, q in enumerate(qlanc.q_points):
+    print(f"iq={i}: q = {q}")
+
+# Print band frequencies at a given q-point (in cm-1)
+iq = 0
+for nu in range(qlanc.n_bands):
+    freq = qlanc.w_q[nu, iq] * CC.Units.RY_TO_CM
+    print(f"  band {nu}: {freq:.2f} cm-1")
+```
+
+#### Custom perturbation vector
+
+To perturb with an arbitrary displacement pattern at a q-point (e.g., for IR-like perturbations):
+
+```python
+# vector is a real-space displacement pattern (3*n_atoms_uc,) in Cartesian coords
+qlanc.prepare_perturbation_q(iq, vector)
+```
+
+This projects the vector onto the q-space eigenmodes at `iq`.
+
+### Looping Over Multiple Modes
+
+To compute the full spectral function, you need to run one Lanczos calculation per mode.
+You can reset the state and prepare a new perturbation between runs:
+
+```python
+import cellconstructor as CC
+import cellconstructor.Phonons
+import sscha, sscha.Ensemble
+import tdscha.QSpaceLanczos as QL
+
+dyn = CC.Phonons.Phonons("dyn_final_", nqirr=3)
+ens = sscha.Ensemble.Ensemble(dyn, 300)
+ens.load_bin("ensemble_dir", 1)
+
+qlanc = QL.QSpaceLanczos(ens)
+qlanc.init(use_symmetries=True)
+
+# Loop over modes at Gamma
+iq = 0
+for band in range(qlanc.n_bands):
+    # Skip acoustic modes (zero frequency)
+    if qlanc.w_q[band, iq] < 1e-6:
+        continue
+
+    qlanc.prepare_mode_q(iq, band)
+    qlanc.run_FT(100)
+    qlanc.save_status(f"qspace_iq{iq}_band{band}.npz")
+```
+
+### Saving and Loading
+
+Checkpoint saving and loading works the same as the standard Lanczos:
+
+```python
+# Save during calculation
+qlanc.run_FT(100, save_each=10, save_dir="output", prefix="qlanc")
+
+# Save final result
+qlanc.save_status("qspace_result.npz")
+```
+
+!!! note "Restart limitations"
+    Restarting from a saved checkpoint currently requires re-creating the `QSpaceLanczos` object from the same ensemble, loading the status, and then continuing. The q-space internal state (pair maps, symmetries) is reconstructed from the ensemble on `init()`.
+
+### When to Use Q-Space vs Real-Space Lanczos
+
+| Feature | Real-space (`Lanczos`) | Q-space (`QSpaceLanczos`) |
+|---------|----------------------|--------------------------|
+| Backend | C, MPI, or Julia | Julia only |
+| Representation | Real or Wigner | Wigner only |
+| Perturbation | Supercell mode index | (q-point, band) index |
+| Symmetry handling | Full space group | Point group (translations analytic) |
+| Two-phonon size | $\sim n_\text{modes}^2/2$ | $\sim N_\text{cell} \cdot n_\text{bands}^2/2$ |
+| Best for | Small supercells, $\Gamma$-only | Large supercells, finite-q perturbations |
+
+The Q-space approach is most advantageous when:
+
+- The supercell is large (large $N_\text{cell}$), since the speedup scales as $\sim N_\text{cell}$
+- You want to study phonon spectral functions at specific q-points
+- Julia is available and configured
+
+For $\Gamma$-point-only calculations on small supercells, the standard Lanczos with `gamma_only=True` may be comparable or faster due to lower overhead.
+
+
 ## Advanced Features
 
 ### Custom Perturbations
diff --git a/meson.build b/meson.build
index 21571f4b..eff79f45 100644
--- a/meson.build
+++ b/meson.build
@@ -1,6 +1,6 @@
 project('tdscha',
   ['c'],
-  version: '1.6.0',
+  version: '1.6.1',
   license: 'GPL',
   meson_version: '>= 1.1.0', # <- set min version of meson.
   default_options : [
@@ -121,11 +121,13 @@ py.install_sources(
     [
       'Modules/__init__.py',
       'Modules/DynamicalLanczos.py',
+      'Modules/QSpaceLanczos.py',
       'Modules/Dynamical.py',
       'Modules/Parallel.py',
       'Modules/Perturbations.py',
       'Modules/StaticHessian.py',
       'Modules/tdscha_core.jl',
+      'Modules/tdscha_qspace.jl',
       'Modules/Tools.py',
       'Modules/cli.py'
     ],
diff --git a/mkdocs.yml b/mkdocs.yml
index 82c928b7..bec50210 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -48,6 +48,7 @@ nav:
   - CLI Tools: cli.md
   - API Reference:
     - DynamicalLanczos: api/dynamical_lanczos.md
+    - QSpaceLanczos: api/qspace_lanczos.md
     - StaticHessian: api/static_hessian.md
 
 # Extra JavaScript for MathJax
diff --git a/pyproject.toml b/pyproject.toml
index f5d86368..07835df7 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -11,7 +11,7 @@ build-backend = "mesonpy"
 [project]
 # Project metadata, which was previously in the `setup()` call of setup.py.
 name = "tdscha"
-version = "1.6.0" # Make sure this version matches meson.build
+version = "1.6.1" # Make sure this version matches meson.build
 description = "Time Dependent Self Consistent Harmonic Approximation"
 authors = [{name = "Lorenzo Monacelli"}]
 readme = "README.md"
diff --git a/tests/test_lanczos_fast/test_c_code_lanczos/mode10.json b/tests/test_lanczos_fast/test_c_code_lanczos/mode10.json
index ca3d669f..7b80b56c 100644
--- a/tests/test_lanczos_fast/test_c_code_lanczos/mode10.json
+++ b/tests/test_lanczos_fast/test_c_code_lanczos/mode10.json
@@ -1 +1 @@
-{"T": 250, "n_steps": 5, "ignore_v2": false, "ignore_v3": false, "ignore_v4": false, "use_wigner": false, "run_sym": false, "data": {"n_configs": 10, "n_modes": 45, "n_syms": 384, "n_blocks": 9, "perturbation_modulus": 1, "reverse": false, "shift": 0}}
\ No newline at end of file
+{"T": 250, "n_steps": 5, "ignore_v2": false, "ignore_v3": false, "ignore_v4": false, "use_wigner": true, "run_sym": false, "data": {"n_configs": 10, "n_modes": 45, "n_syms": 384, "n_blocks": 9, "perturbation_modulus": 1, "reverse": false, "shift": 0}}
\ No newline at end of file
diff --git a/tests/test_qspace/test_qspace_lanczos.py b/tests/test_qspace/test_qspace_lanczos.py
new file mode 100644
index 00000000..777e6483
--- /dev/null
+++ b/tests/test_qspace/test_qspace_lanczos.py
@@ -0,0 +1,273 @@
+"""
+Test that compares the spectral function computed via Q-space Lanczos
+and real-space Lanczos (both using Wigner mode) on the same SnTe-like system.
+
+The benchmark is on the Green function (real and imaginary parts)
+evaluated at omega=0 and at the frequency of the perturbed mode.
+
+This test perturbs a Gamma-point optical mode. Because the optical modes
+at Gamma are triply degenerate, we must carefully project the supercell
+eigenvector onto q-space bands to get the correct mapping.
+"""
+from __future__ import print_function
+
+import numpy as np
+import os
+import pytest
+
+import cellconstructor as CC
+import cellconstructor.Phonons
+import cellconstructor.Methods
+import cellconstructor.symmetries
+
+import sscha, sscha.Ensemble
+import tdscha.DynamicalLanczos as DL
+
+from tdscha.Parallel import pprint as print
+
+# Test parameters
+N_STEPS = 10
+T = 250
+NQIRR = 3
+
+# Tolerance on the Green function comparison (relative)
+GF_RTOL = 0.01  # 1% relative tolerance
+
+# Data directory (reuse test_julia data)
+DATA_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)),
+                        '..', 'test_julia', 'data')
+
+
+def _find_gamma_mode_mapping():
+    """Find the correct mapping between a supercell mode and q-space band at Gamma.
+
+    Because the Gamma optical modes are degenerate, we cannot simply match by
+    frequency — the eigenvector gauge within the degenerate subspace may differ.
+    Instead, we project the supercell eigenvector onto the q-space band basis
+    to find the correct band_index.
+
+    Returns
+    -------
+    mode_index : int
+        Supercell mode index (after translation removal) for the selected mode.
+    band_index : int
+        Matching band index at Gamma (0-based).
+    w_q : ndarray
+        Frequencies at each q-point, shape (n_bands, n_q).
+    pols_q : ndarray
+        Polarization vectors at each q-point, shape (3*nat_uc, n_bands, n_q).
+    """
+    dyn = CC.Phonons.Phonons(os.path.join(DATA_DIR, "dyn_gen_pop1_"), NQIRR)
+    ws_sc, pols_sc, w_q, pols_q = dyn.DiagonalizeSupercell(return_qmodes=True)
+
+    super_structure = dyn.structure.generate_supercell(dyn.GetSupercell())
+    m = super_structure.get_masses_array()  # (nat_sc,)
+    trans_mask = CC.Methods.get_translations(pols_sc, m)
+    good_ws = ws_sc[~trans_mask]
+    orig_indices = np.where(~trans_mask)[0]
+
+    n_bands = 3 * dyn.structure.N_atoms
+    n_cell = np.prod(dyn.GetSupercell())
+    nat_uc = dyn.structure.N_atoms
+    nat_sc = super_structure.N_atoms
+    itau = super_structure.get_itau(dyn.structure) - 1  # 0-indexed
+
+    # Find first non-acoustic Gamma band
+    for band in range(n_bands):
+        if w_q[band, 0] > 1e-6:
+            target_freq = w_q[band, 0]
+            break
+    else:
+        raise ValueError("No non-acoustic Gamma mode found")
+
+    # Find the supercell mode matching this frequency
+    mode_index = np.argmin(np.abs(good_ws - target_freq))
+    freq_diff = abs(good_ws[mode_index] - target_freq)
+    assert freq_diff < 1e-8, (
+        "Mode frequency mismatch: {:.10e} vs {:.10e}".format(
+            good_ws[mode_index], target_freq))
+
+    # Project the supercell eigenvector onto Gamma q-space bands
+    orig_mode = orig_indices[mode_index]
+    pol_sc_mode = pols_sc[:, orig_mode]  # (3*N_sc,)
+
+    # Sum over supercell images to get Gamma component
+    pol_gamma = np.zeros(3 * nat_uc)
+    for i_sc in range(nat_sc):
+        i_uc = itau[i_sc]
+        pol_gamma[3*i_uc:3*i_uc+3] += pol_sc_mode[3*i_sc:3*i_sc+3]
+    pol_gamma /= np.sqrt(n_cell)
+
+    # Project onto q-space bands: R1[nu] = conj(pol_q[:, nu, 0]).T @ pol_gamma
+    R1 = np.conj(pols_q[:, :, 0]).T @ pol_gamma
+    band_index = np.argmax(np.abs(R1))
+
+    assert np.abs(R1[band_index]) > 0.99, (
+        "Projection onto q-space bands is not clean: |R1| = {}, R1 = {}".format(
+            np.abs(R1), R1))
+
+    return mode_index, band_index, w_q, pols_q
+
+
+def _setup_realspace_lanczos(mode_index):
+    """Set up and run a real-space Lanczos calculation (Wigner, serial C)."""
+    dyn = CC.Phonons.Phonons(os.path.join(DATA_DIR, "dyn_gen_pop1_"), NQIRR)
+    ens = sscha.Ensemble.Ensemble(dyn, T)
+    ens.load_bin(DATA_DIR, 1)
+
+    lanc = DL.Lanczos(ens, lo_to_split=None)
+    lanc.ignore_harmonic = False
+    lanc.ignore_v3 = False
+    lanc.ignore_v4 = False
+    lanc.use_wigner = True
+    lanc.mode = DL.MODE_FAST_SERIAL
+    lanc.init(use_symmetries=True)
+    lanc.prepare_mode(mode_index)
+    lanc.run_FT(N_STEPS, run_simm=True, verbose=False)
+    return lanc
+
+
+def _setup_qspace_lanczos(iq, band_index):
+    """Set up and run a Q-space Lanczos calculation."""
+    try:
+        import tdscha.QSpaceLanczos as QL
+    except ImportError:
+        pytest.skip("Julia not available for QSpaceLanczos")
+
+    if not QL.__JULIA_EXT__:
+        pytest.skip("Julia extension not loaded")
+
+    dyn = CC.Phonons.Phonons(os.path.join(DATA_DIR, "dyn_gen_pop1_"), NQIRR)
+    ens = sscha.Ensemble.Ensemble(dyn, T)
+    ens.load_bin(DATA_DIR, 1)
+
+    qlanc = QL.QSpaceLanczos(ens, lo_to_split=None)
+    qlanc.ignore_harmonic = False
+    qlanc.ignore_v3 = False
+    qlanc.ignore_v4 = False
+    qlanc.init(use_symmetries=True)
+    qlanc.prepare_mode_q(iq, band_index)
+    qlanc.run_FT(N_STEPS, verbose=False)
+    return qlanc
+
+
+def test_qspace_vs_realspace_green_function(verbose=False):
+    """
+    Compare Green functions from Q-space and real-space Lanczos
+    at omega=0 and at the perturbed mode frequency.
+
+    The renormalized frequency from 1/Re[G(0)] must agree within tolerance.
+    """
+    try:
+        import tdscha.QSpaceLanczos as QL
+        if not QL.__JULIA_EXT__:
+            pytest.skip("Julia extension not loaded")
+    except ImportError:
+        pytest.skip("Julia not available for QSpaceLanczos")
+
+    mode_index, band_index, w_q, pols_q = _find_gamma_mode_mapping()
+
+    if verbose:
+        print()
+        print("=== Q-space vs real-space Lanczos comparison ===")
+        print("Testing Gamma band {} (freq {:.2f} cm-1)".format(
+            band_index, w_q[band_index, 0] * CC.Units.RY_TO_CM))
+        print("  -> supercell mode index {}".format(mode_index))
+        print()
+
+    # Run both calculations
+    lanc_real = _setup_realspace_lanczos(mode_index)
+    lanc_qspace = _setup_qspace_lanczos(0, band_index)
+
+    # Get the harmonic frequency of the perturbed mode (in Ry)
+    w_mode = lanc_real.w[mode_index]
+
+    if verbose:
+        print("Lanczos coefficients comparison:")
+        print("  a_0 real-space: {:.10e}".format(lanc_real.a_coeffs[0]))
+        print("  a_0 q-space:    {:.10e}".format(lanc_qspace.a_coeffs[0]))
+        print("  b_0 real-space: {:.10e}".format(lanc_real.b_coeffs[0]))
+        print("  b_0 q-space:    {:.10e}".format(lanc_qspace.b_coeffs[0]))
+        print()
+
+    # Small imaginary broadening
+    smearing = w_mode * 0.1
+
+    # Evaluate the Green function at omega=0 and at the mode frequency
+    w_points = np.array([0.0, w_mode])
+
+    gf_real = lanc_real.get_green_function_continued_fraction(
+        w_points, use_terminator=False, smearing=smearing
+    )
+    gf_qspace = lanc_qspace.get_green_function_continued_fraction(
+        w_points, use_terminator=False, smearing=smearing
+    )
+
+    real_real = np.real(gf_real)
+    real_qspace = np.real(gf_qspace)
+    spectral_real = -np.imag(gf_real)
+    spectral_qspace = -np.imag(gf_qspace)
+
+    if verbose:
+        print("At omega = 0:")
+        print("  Re[G] real-space:  {:.10e}".format(real_real[0]))
+        print("  Re[G] q-space:     {:.10e}".format(real_qspace[0]))
+        print("  Im[G] real-space:  {:.10e}".format(spectral_real[0]))
+        print("  Im[G] q-space:     {:.10e}".format(spectral_qspace[0]))
+
+        print()
+        print("At omega = w_mode ({:.2f} cm-1):".format(
+            w_mode * CC.Units.RY_TO_CM))
+        print("  Re[G] real-space:  {:.10e}".format(real_real[1]))
+        print("  Re[G] q-space:     {:.10e}".format(real_qspace[1]))
+        print("  Im[G] real-space:  {:.10e}".format(spectral_real[1]))
+        print("  Im[G] q-space:     {:.10e}".format(spectral_qspace[1]))
+
+    # --- Assertions ---
+    # Renormalized frequency from Re[G(0)]: w^2 = 1/Re[G(0)]
+    w2_real = 1.0 / real_real[0]
+    w2_qspace = 1.0 / real_qspace[0]
+    freq_real = np.sign(w2_real) * np.sqrt(np.abs(w2_real)) * CC.Units.RY_TO_CM
+    freq_qspace = np.sign(w2_qspace) * np.sqrt(np.abs(w2_qspace)) * CC.Units.RY_TO_CM
+
+    if verbose:
+        print()
+        print("Renormalized frequency (real-space):  {:.6f} cm-1".format(freq_real))
+        print("Renormalized frequency (Q-space):     {:.6f} cm-1".format(freq_qspace))
+        print("Difference:                           {:.6f} cm-1".format(
+            abs(freq_real - freq_qspace)))
+
+    assert abs(freq_real - freq_qspace) < 0.1, (
+        "Renormalized frequencies differ too much: "
+        "real-space={:.4f}, q-space={:.4f} cm-1".format(freq_real, freq_qspace)
+    )
+
+    # Compare spectral function at omega = w_mode
+    ref_spectral = max(abs(spectral_real[1]), abs(spectral_qspace[1]))
+    if ref_spectral > 1e-15:
+        rel_diff_spectral = abs(spectral_real[1] - spectral_qspace[1]) / ref_spectral
+        print("Relative diff in spectral function at w_mode: {:.6e}".format(
+            rel_diff_spectral))
+        assert rel_diff_spectral < GF_RTOL, (
+            "Spectral functions at w_mode differ by {:.4e} "
+            "(tolerance: {})".format(rel_diff_spectral, GF_RTOL)
+        )
+
+    # Compare real part of GF at omega = w_mode
+    ref_real_wm = max(abs(real_real[1]), abs(real_qspace[1]))
+    if ref_real_wm > 1e-15:
+        rel_diff_real = abs(real_real[1] - real_qspace[1]) / ref_real_wm
+        print("Relative diff in Re[G] at w_mode:              {:.6e}".format(
+            rel_diff_real))
+        assert rel_diff_real < GF_RTOL, (
+            "Real parts of GF at w_mode differ by {:.4e} "
+            "(tolerance: {})".format(rel_diff_real, GF_RTOL)
+        )
+
+    if verbose:
+        print()
+        print("=== All benchmarks passed ===")
+
+
+if __name__ == "__main__":
+    test_qspace_vs_realspace_green_function(verbose=True)