Visual-Representation-Learning-JEPA/models.py at main · Alpsource/Visual-Representation-Learning-JEPA · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
"""
models.py  —  I-JEPA architecture for STL-10

Components
----------
PatchEmbed          : image → sequence of patch tokens
TransformerBlock    : standard ViT block (attention + MLP)
ContextEncoder      : heavyweight ViT encoder (processes visible context patches)
TargetEncoder       : identical architecture; weights are EMA of ContextEncoder
Predictor           : narrow ViT conditioned on positional mask tokens
IJEPA               : complete model with multi-block masking strategy

Refactorings:
- Fixed 2D sincos positional embeddings (was missing column information).
- Vectorized Predictor forward pass (batch processing B*M sequences).
- Vectorized IJEPA loss calculation.
- Added target normalization (LayerNorm) to prevent representation collapse.
- Used torch.nn.functional.scaled_dot_product_attention for speed and efficiency.
"""

import math
import torch
import torch.nn as nn
import torch.nn.functional as F


# ---------------------------------------------------------------------------
# Patch embedding
# ---------------------------------------------------------------------------

class PatchEmbed(nn.Module):
    """Turn an image batch into a sequence of projected patch tokens.

    A convolution whose kernel size and stride both equal ``patch_size`` cuts
    the image into non-overlapping patches and projects each of them to
    ``embed_dim`` channels in one operation.
    """

    def __init__(self, img_size: int = 96, patch_size: int = 8, in_chans: int = 3,
                 embed_dim: int = 384):
        super().__init__()
        patches_per_side = img_size // patch_size

        self.img_size = img_size
        self.patch_size = patch_size
        self.grid_size = patches_per_side            # patches along one image side
        self.num_patches = patches_per_side ** 2     # patches in the whole (square) grid

        self.proj = nn.Conv2d(
            in_chans,
            embed_dim,
            kernel_size=patch_size,
            stride=patch_size,
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Map images ``(B, C, H, W)`` to tokens ``(B, N, E)``.

        The two spatial axes of the conv output are flattened together, so the
        tokens follow row-major patch order.
        """
        feature_map = self.proj(x)                    # (B, E, H/patch, W/patch)
        return feature_map.flatten(start_dim=2).transpose(1, 2)


# ---------------------------------------------------------------------------
# Transformer building blocks
# ---------------------------------------------------------------------------

class Attention(nn.Module):
    def __init__(self, dim: int, num_heads: int = 8, qkv_bias: bool = True,
                 attn_drop: float = 0.0, proj_drop: float = 0.0):
        super().__init__()
        self.num_heads = num_heads
        self.qkv  = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.proj = nn.Linear(dim, dim)
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj_drop = nn.Dropout(proj_drop)

    def forward(self, x: torch.Tensor, mask: torch.Tensor = None) -> torch.Tensor:
        B, N, C = x.shape
        qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
        q, k, v = qkv.unbind(0)                 # (B, H, N, D)

        # mask: (B, N) bool, True means "padding" (do NOT attend)
        # SDPA boolean convention: True = attend, False = block.  Must invert.
        attn_mask = mask
        if attn_mask is not None:
            if attn_mask.dim() == 2:
                attn_mask = ~attn_mask.view(B, 1, 1, N)  # True=attend

        # Faster attention using SDPA (available in Torch 2.0+)
        x = F.scaled_dot_product_attention(
            q, k, v,
            attn_mask=attn_mask,
            dropout_p=self.attn_drop.p if self.training else 0.0
        )

        x = x.transpose(1, 2).reshape(B, N, C)
        x = self.proj(x)
        x = self.proj_drop(x)
        return x


class MLP(nn.Module):
    """Position-wise feed-forward net: Linear → GELU → Dropout → Linear → Dropout.

    The output width equals ``in_features``; the same dropout module is applied
    after the activation and after the second linear layer.
    """

    def __init__(self, in_features: int, hidden_features: int, drop: float = 0.0):
        super().__init__()
        self.fc1 = nn.Linear(in_features, hidden_features)    # expand
        self.act = nn.GELU()
        self.fc2 = nn.Linear(hidden_features, in_features)    # contract
        self.drop = nn.Dropout(drop)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply the feed-forward network along the last dimension of ``x``."""
        hidden = self.drop(self.act(self.fc1(x)))
        return self.drop(self.fc2(hidden))


class TransformerBlock(nn.Module):
    """Pre-norm transformer block: residual self-attention, then residual MLP.

    ``drop`` is used both as the attention's output-projection dropout and as
    the MLP dropout; ``attn_drop`` is the dropout on the attention weights.
    The MLP hidden width is ``int(dim * mlp_ratio)``.
    """

    def __init__(self, dim: int, num_heads: int, mlp_ratio: float = 4.0,
                 qkv_bias: bool = True, drop: float = 0.0, attn_drop: float = 0.0):
        super().__init__()
        hidden_dim = int(dim * mlp_ratio)

        self.norm1 = nn.LayerNorm(dim)
        self.attn = Attention(
            dim,
            num_heads=num_heads,
            qkv_bias=qkv_bias,
            attn_drop=attn_drop,
            proj_drop=drop,
        )
        self.norm2 = nn.LayerNorm(dim)
        self.mlp = MLP(dim, hidden_dim, drop=drop)

    def forward(self, x: torch.Tensor, mask: torch.Tensor = None) -> torch.Tensor:
        """Run the block on ``x``; ``mask`` is forwarded to the attention layer only."""
        attended = self.attn(self.norm1(x), mask=mask)
        x = x + attended                       # residual around attention
        transformed = self.mlp(self.norm2(x))
        return x + transformed                 # residual around MLP


# ---------------------------------------------------------------------------
# Sinusoidal 2-D positional embedding (Fixed)
# ---------------------------------------------------------------------------

def get_2d_sincos_pos_embed(embed_dim: int, grid_size: int) -> torch.Tensor:
    """Build the fixed 2-D sin/cos position table for a square patch grid.

    A ``(2, grid_size, grid_size)`` coordinate grid (row indices first, column
    indices second) is assembled and passed to
    ``get_2d_sincos_pos_embed_from_grid``, whose result is returned as-is.
    """
    axis = torch.arange(grid_size, dtype=torch.float32)
    rows, cols = torch.meshgrid(axis, axis, indexing='ij')
    coords = torch.stack((rows, cols), dim=0)  # (2, G, G)
    return get_2d_sincos_pos_embed_from_grid(embed_dim, coords)

def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
    """Encode a 2-axis coordinate grid, giving each axis half of ``embed_dim``.

    ``grid[0]`` and ``grid[1]`` are each encoded with the 1-D helper using
    ``embed_dim // 2`` channels, and the two encodings are concatenated along
    the channel axis (``grid[0]`` first).
    """
    assert embed_dim % 2 == 0
    per_axis_dim = embed_dim // 2
    per_axis = [
        get_1d_sincos_pos_embed_from_grid(per_axis_dim, grid[axis])  # (G*G, D/2)
        for axis in (0, 1)
    ]
    return torch.cat(per_axis, dim=1)  # (G*G, D)

def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
    """Sin/cos encoding of a set of scalar positions.

    ``pos`` is flattened to ``M`` values.  Each value is multiplied by
    ``embed_dim // 2`` frequencies ``1 / 10000**(i / (embed_dim / 2))`` and the
    result is returned as ``(M, embed_dim)``: all sines first, then all cosines.
    """
    assert embed_dim % 2 == 0
    num_freqs = embed_dim // 2
    exponents = torch.arange(num_freqs, dtype=torch.float32) / (embed_dim / 2.)
    freqs = 1. / (10000 ** exponents)                     # (D/2,)

    flat_pos = pos.reshape(-1)                            # (M,)
    angles = torch.einsum('m,d->md', flat_pos, freqs)     # (M, D/2)

    return torch.cat([torch.sin(angles), torch.cos(angles)], dim=1)  # (M, D)


# ---------------------------------------------------------------------------
# Multi-block masking  (I-JEPA §3)
# ---------------------------------------------------------------------------

class MultiBlockMasking:
    """
    Samples context and target block masks per the I-JEPA recipe.

    Target blocks: M blocks, each random scale in (target_scale_min, target_scale_max)
                   and aspect ratio in (aspect_min, aspect_max).
    Context block: 1 square block with scale in (context_scale_min, context_scale_max).
                   Target-overlapping regions are removed from context.

    Removing the targets can leave no context patch at all (e.g. large target
    scales on a small grid).  In that case the whole sample is redrawn, up to
    ``max_tries`` times; if every attempt fails, the last context block is kept
    WITHOUT removing the targets so that the context is never empty.  When the
    first draw is valid the sequence of random draws matches a plain
    single-attempt sampler.
    """

    def __init__(
        self,
        grid_size: int = 12,          # number of patches per side
        num_targets: int = 4,
        target_scale_min: float = 0.15,
        target_scale_max: float = 0.20,
        context_scale_min: float = 0.85,
        context_scale_max: float = 1.0,
        aspect_min: float = 0.75,
        aspect_max: float = 1.50,
        max_tries: int = 20,          # resampling budget when the context ends up empty
    ):
        self.grid_size         = grid_size
        self.num_targets       = num_targets
        self.target_scale_min  = target_scale_min
        self.target_scale_max  = target_scale_max
        self.context_scale_min = context_scale_min
        self.context_scale_max = context_scale_max
        self.aspect_min        = aspect_min
        self.aspect_max        = aspect_max
        self.num_patches       = grid_size * grid_size
        self.max_tries         = max(1, max_tries)   # always sample at least once

    # ------------------------------------------------------------------
    def _sample_block(self, scale_min: float, scale_max: float,
                      fix_aspect: bool = False) -> torch.Tensor:
        """Return a boolean mask of shape (num_patches,) for one rectangular block.

        The block area is a uniform fraction of the grid in
        [scale_min, scale_max]; the aspect ratio is 1.0 when ``fix_aspect`` is
        set, otherwise uniform in [aspect_min, aspect_max].  Height and width
        are rounded and clamped to [1, grid_size], and the block is placed
        uniformly at random so that it fits inside the grid.
        """
        area   = self.num_patches * torch.empty(1).uniform_(scale_min, scale_max).item()
        aspect = 1.0 if fix_aspect else torch.empty(1).uniform_(
            self.aspect_min, self.aspect_max).item()

        h = max(1, min(self.grid_size, int(round(math.sqrt(area / aspect)))))
        w = max(1, min(self.grid_size, int(round(math.sqrt(area * aspect)))))

        top  = torch.randint(0, self.grid_size - h + 1, (1,)).item()
        left = torch.randint(0, self.grid_size - w + 1, (1,)).item()

        mask = torch.zeros(self.grid_size, self.grid_size, dtype=torch.bool)
        mask[top:top + h, left:left + w] = True
        return mask.flatten()

    # ------------------------------------------------------------------
    def _sample_once(self):
        """Draw one set of masks for a single image.

        Returns ``(per_target, context_block, context_minus_targets)``: the
        list of target index tensors, the raw context bool mask, and the
        context bool mask with every target patch removed.
        """
        target_union = torch.zeros(self.num_patches, dtype=torch.bool)
        per_target = []
        for _ in range(self.num_targets):
            block = self._sample_block(self.target_scale_min, self.target_scale_max)
            per_target.append(block.nonzero(as_tuple=False).squeeze(1))
            target_union |= block

        context_block = self._sample_block(self.context_scale_min,
                                           self.context_scale_max,
                                           fix_aspect=True)
        return per_target, context_block, context_block & (~target_union)

    # ------------------------------------------------------------------
    def __call__(self, batch_size: int, device: torch.device):
        """
        Returns
        -------
        context_masks  : list[Tensor]  len=batch_size, each (Nc,) patch indices, Nc >= 1
        target_masks   : list[list[Tensor]]  [B][M] each (Nt_i,) patch indices
        """
        context_masks = []
        target_masks  = []

        for _ in range(batch_size):
            for _attempt in range(self.max_tries):
                per_target, context_block, context = self._sample_once()
                if context.any():
                    break
            else:
                # Every attempt produced an empty context: relax the
                # non-overlap constraint instead of returning zero tokens.
                context = context_block

            context_masks.append(context.nonzero(as_tuple=False).squeeze(1).to(device))
            target_masks.append([t.to(device) for t in per_target])

        return context_masks, target_masks


# ---------------------------------------------------------------------------
# Context Encoder  (heavyweight ViT, processes visible context patches)
# ---------------------------------------------------------------------------

class ContextEncoder(nn.Module):
    """ViT encoder that runs only on the visible (context) patches of each image."""

    def __init__(
        self,
        img_size: int   = 96,
        patch_size: int = 8,
        in_chans: int   = 3,
        embed_dim: int  = 384,
        depth: int      = 6,
        num_heads: int  = 6,
        mlp_ratio: float = 4.0,
    ):
        super().__init__()
        self.patch_embed = PatchEmbed(img_size, patch_size, in_chans, embed_dim)

        # Fixed sin/cos position table, registered as a buffer (not a
        # parameter) so it is never trained: shape (1, N, E).
        pos_table = get_2d_sincos_pos_embed(embed_dim, self.patch_embed.grid_size)
        self.register_buffer('pos_embed', pos_table.unsqueeze(0))

        layers = [TransformerBlock(embed_dim, num_heads, mlp_ratio) for _ in range(depth)]
        self.blocks = nn.ModuleList(layers)
        self.norm = nn.LayerNorm(embed_dim)
        self.embed_dim = embed_dim

    def forward(self, x: torch.Tensor,
                context_masks: list[torch.Tensor]) -> list[torch.Tensor]:
        """
        Encode the context patches of every image in the batch.

        Parameters
        ----------
        x             : (B, C, H, W)
        context_masks : list of (Nc,) integer index tensors  len = B

        Returns
        -------
        list of (Nc, E) tensors — one per sample
        """
        # Positions are added before selection so each kept token carries the
        # embedding of its original grid location.
        tokens = self.patch_embed(x) + self.pos_embed              # (B, N, E)

        # Context sizes may differ per sample, so select per sample first.
        visible = [tokens[row][idx] for row, idx in enumerate(context_masks)]
        lengths = [seq.shape[0] for seq in visible]
        batch, longest = len(visible), max(lengths)

        # Right-pad with zeros to a common length.  pad_mask is True on padding
        # slots (must NOT be attended to) and False on real context tokens.
        padded = tokens.new_zeros(batch, longest, self.embed_dim)
        pad_mask = torch.ones(batch, longest, dtype=torch.bool, device=x.device)
        for row, (seq, n) in enumerate(zip(visible, lengths)):
            padded[row, :n] = seq
            pad_mask[row, :n] = False

        hidden = padded
        for block in self.blocks:
            hidden = block(hidden, mask=pad_mask)
        hidden = self.norm(hidden)

        # Drop the padding again: one (Nc, E) tensor per sample.
        return [hidden[row, :n] for row, n in enumerate(lengths)]


# ---------------------------------------------------------------------------
# Target Encoder  (same arch; weights updated via EMA of context encoder)
# ---------------------------------------------------------------------------

class TargetEncoder(nn.Module):
    """
    Encoder with the same layer layout as ContextEncoder, applied to every patch.

    ``forward`` runs under ``torch.no_grad()``, so no gradients flow through
    this module; its weights are meant to be set from outside (EMA of the
    context encoder).
    """

    def __init__(
        self,
        img_size: int   = 96,
        patch_size: int = 8,
        in_chans: int   = 3,
        embed_dim: int  = 384,
        depth: int      = 6,
        num_heads: int  = 6,
        mlp_ratio: float = 4.0,
    ):
        super().__init__()
        self.patch_embed = PatchEmbed(img_size, patch_size, in_chans, embed_dim)

        # Fixed sin/cos position table stored as a non-trainable buffer (1, N, E).
        pos_table = get_2d_sincos_pos_embed(embed_dim, self.patch_embed.grid_size)
        self.register_buffer('pos_embed', pos_table.unsqueeze(0))

        layers = [TransformerBlock(embed_dim, num_heads, mlp_ratio) for _ in range(depth)]
        self.blocks = nn.ModuleList(layers)
        self.norm = nn.LayerNorm(embed_dim)

    @torch.no_grad()
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Encode the full image; returns (B, N, E) with one vector per patch."""
        hidden = self.patch_embed(x) + self.pos_embed
        for block in self.blocks:
            hidden = block(hidden)          # no mask: every patch is visible
        return self.norm(hidden)


# ---------------------------------------------------------------------------
# Predictor  (narrow ViT, conditioned on positional mask tokens)
# ---------------------------------------------------------------------------

class Predictor(nn.Module):
    """
    Maps context encoder output → predicted target block representations.

    Architecture: narrow ViT (``pred_dim`` wide) conditioned on mask tokens
    that carry the positional embedding of each patch to be predicted.

    Parameters
    ----------
    context_dim : width of the incoming context tokens (encoder embed_dim).
    pred_dim    : internal width of the predictor transformer.
    output_dim  : width of the returned predictions.
    depth, num_heads, mlp_ratio : transformer configuration.
    img_size, patch_size        : define the patch grid for the position table.
    """

    def __init__(
        self,
        context_dim: int  = 384,   # encoder embed_dim
        pred_dim: int     = 192,   # predictor width (narrower than encoder)
        output_dim: int   = 384,   # width of the predicted representations
        depth: int        = 4,
        num_heads: int    = 6,
        mlp_ratio: float  = 4.0,
        img_size: int     = 96,
        patch_size: int   = 8,
    ):
        super().__init__()
        self.pred_dim   = pred_dim
        self.output_dim = output_dim

        grid_size = img_size // patch_size
        pos_emb   = get_2d_sincos_pos_embed(context_dim, grid_size)
        self.register_buffer('full_pos_embed', pos_emb)  # (N, E_ctx)

        # Project context tokens from encoder space (context_dim) into the
        # predictor space (pred_dim).
        self.ctx_proj = nn.Linear(context_dim, pred_dim)

        # Shared learnable mask token — a single vector that stands in for every
        # unknown target patch before positional information is added.
        self.mask_token = nn.Parameter(torch.zeros(1, 1, pred_dim))
        nn.init.trunc_normal_(self.mask_token, std=0.02)

        # Project the sinusoidal position embedding (context_dim) into predictor
        # space (pred_dim).  This injects positional information into each mask
        # token so the predictor knows *where* it must predict.
        self.pos_proj = nn.Linear(context_dim, pred_dim)

        self.blocks = nn.ModuleList([
            TransformerBlock(pred_dim, num_heads, mlp_ratio)
            for _ in range(depth)
        ])
        self.norm = nn.LayerNorm(pred_dim)

        # project back to target representation space
        self.output_proj = nn.Linear(pred_dim, output_dim)

    def forward(
        self,
        ctx_out: list[torch.Tensor],         # list[B] of (Nc, E_ctx)
        target_masks: list[list[torch.Tensor]],  # [B][M] of (Nt,) indices
        device: torch.device,
    ) -> list[list[torch.Tensor]]:           # [B][M] of (Nt, E_out)
        """
        Predict every target block of every sample in one padded batch.

        One sequence ``[context tokens | target mask tokens]`` is built per
        (sample, target block) pair, giving B*M sequences that are padded to a
        common length and pushed through the transformer together.

        ``device`` is accepted for backward compatibility only: the padding
        mask is allocated on the device of the token tensor itself, so a
        mismatching argument can no longer cause a cross-device error.

        Returns a nested list ``[B][M]`` of ``(Nt, output_dim)`` tensors; for an
        empty batch, or when there are no target blocks, the inner lists are
        empty.
        """
        B = len(ctx_out)
        M = len(target_masks[0]) if B > 0 else 0
        if B == 0 or M == 0:
            return [[] for _ in range(B)]

        # The position table is fixed, so project it once per call and index
        # into the result instead of re-projecting rows for every target block.
        pos_table = self.pos_proj(self.full_pos_embed)   # (N, D)
        mask_vec = self.mask_token[0]                    # (1, D), broadcasts over Nt

        all_seq = []
        lengths_ctx = []
        lengths_tgt = []

        for i in range(B):
            ctx_i = self.ctx_proj(ctx_out[i])            # (Nc, D)
            Nc = ctx_i.shape[0]
            for m in range(M):
                t_idx = target_masks[i][m]

                # "predict this unknown patch at this location"
                tgt_tok = mask_vec + pos_table[t_idx]    # (Nt, D)

                # Sequence layout: [context tokens | target mask tokens].
                # All tokens share one self-attention, so target tokens can
                # attend to the context and to each other.
                all_seq.append(torch.cat([ctx_i, tgt_tok], dim=0))  # (Nc+Nt, D)
                lengths_ctx.append(Nc)
                lengths_tgt.append(tgt_tok.shape[0])

        # Pad all sequences for batch processing (True in `mask` = padding).
        max_len = max(len(s) for s in all_seq)
        padded_seq = all_seq[0].new_zeros(B * M, max_len, self.pred_dim)
        mask = torch.ones(B * M, max_len, dtype=torch.bool, device=padded_seq.device)
        for idx, s in enumerate(all_seq):
            padded_seq[idx, :len(s)] = s
            mask[idx, :len(s)] = False

        h = padded_seq
        for blk in self.blocks:
            h = blk(h, mask=mask)
        h = self.norm(h)
        h = self.output_proj(h)                          # (B*M, max_len, E_out)

        # Slice the target positions back out into the nested [B][M] layout.
        res = [[None] * M for _ in range(B)]
        for idx, (Nc, Nt) in enumerate(zip(lengths_ctx, lengths_tgt)):
            i, m = divmod(idx, M)
            res[i][m] = h[idx, Nc:Nc + Nt]

        return res


# ---------------------------------------------------------------------------
# Complete I-JEPA model
# ---------------------------------------------------------------------------

class IJEPA(nn.Module):
    """
    Image-based Joint-Embedding Predictive Architecture.
    """

    def __init__(
        self,
        img_size: int    = 96,
        patch_size: int  = 8,
        in_chans: int    = 3,
        encoder_dim: int = 384,
        encoder_depth: int  = 6,
        encoder_heads: int  = 6,
        predictor_dim: int  = 192,
        predictor_depth: int = 4,
        predictor_heads: int = 6,
        mlp_ratio: float = 4.0,
        # masking hyper-parameters
        num_targets: int          = 4,
        target_scale_min: float   = 0.15,
        target_scale_max: float   = 0.20,
        context_scale_min: float  = 0.85,
        context_scale_max: float  = 1.00,
        aspect_min: float         = 0.75,
        aspect_max: float         = 1.50,
        # EMA momentum
        ema_momentum: float       = 0.996,
        ema_momentum_final: float = 1.000,
    ):
        super().__init__()

        grid_size   = img_size // patch_size

        # ---- encoders ----
        enc_kwargs = dict(
            img_size=img_size, patch_size=patch_size, in_chans=in_chans,
            embed_dim=encoder_dim, depth=encoder_depth,
            num_heads=encoder_heads, mlp_ratio=mlp_ratio,
        )
        self.context_encoder = ContextEncoder(**enc_kwargs)
        self.target_encoder  = TargetEncoder(**enc_kwargs)

        # initialise target encoder = context encoder, no grad
        self._copy_encoder_weights()
        for p in self.target_encoder.parameters():
            p.requires_grad_(False)

        # ---- predictor ----
        self.predictor = Predictor(
            context_dim=encoder_dim,
            pred_dim=predictor_dim,
            output_dim=encoder_dim,
            depth=predictor_depth,
            num_heads=predictor_heads,
            mlp_ratio=mlp_ratio,
            img_size=img_size,
            patch_size=patch_size,
        )

        # ---- masking strategy ----
        self.masking = MultiBlockMasking(
            grid_size=grid_size,
            num_targets=num_targets,
            target_scale_min=target_scale_min,
            target_scale_max=target_scale_max,
            context_scale_min=context_scale_min,
            context_scale_max=context_scale_max,
            aspect_min=aspect_min,
            aspect_max=aspect_max,
        )

        self.ema_momentum       = ema_momentum
        self.ema_momentum_final = ema_momentum_final
        self.encoder_dim        = encoder_dim

        # ---- initialization ----
        self.apply(self._init_weights)
        # initialise target encoder = context encoder, no grad
        self._copy_encoder_weights()
        for p in self.target_encoder.parameters():
            p.requires_grad_(False)

    # ------------------------------------------------------------------
    # Weight initialisation helpers
    # ------------------------------------------------------------------

    def _init_weights(self, m: nn.Module):
        if isinstance(m, nn.Linear):
            nn.init.trunc_normal_(m.weight, std=0.02)
            if m.bias is not None:
                nn.init.zeros_(m.bias)
        elif isinstance(m, nn.LayerNorm):
            nn.init.ones_(m.weight)
            nn.init.zeros_(m.bias)
        elif isinstance(m, nn.Conv2d):
            nn.init.trunc_normal_(m.weight, std=0.02)
            if m.bias is not None:
                nn.init.zeros_(m.bias)

    def _copy_encoder_weights(self):
        """Hard-copy context encoder weights into target encoder."""
        for p_c, p_t in zip(self.context_encoder.parameters(),
                             self.target_encoder.parameters()):
            p_t.data.copy_(p_c.data)

    # ------------------------------------------------------------------
    # EMA update
    # ------------------------------------------------------------------

    @torch.no_grad()
    def update_target_encoder(self, momentum: float | None = None):
        """
        Exponential Moving Average update: θ_t ← m·θ_t + (1−m)·θ_c

        High momentum (close to 1) means the target encoder changes slowly,
        providing stable prediction targets and preventing collapse.
        momentum is annealed from 0.996 → 1.0 so the target encoder
        freezes completely by the end of training.
        """
        m = momentum if momentum is not None else self.ema_momentum
        for p_c, p_t in zip(self.context_encoder.parameters(),
                             self.target_encoder.parameters()):
            p_t.data.mul_(m).add_((1.0 - m) * p_c.data)

    def get_current_momentum(self, step: int, total_steps: int) -> float:
        """Linearly anneal EMA momentum from ema_momentum → ema_momentum_final."""
        return self.ema_momentum + (self.ema_momentum_final - self.ema_momentum) * (
            step / max(total_steps - 1, 1)
        )

    # ------------------------------------------------------------------
    # Forward pass — returns scalar loss
    # ------------------------------------------------------------------

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Returns scalar loss — average L2 distance over all predicted target patches
        """
        B      = x.shape[0]
        device = x.device

        # 1. sample masks
        context_masks, target_masks = self.masking(B, device)
        M = len(target_masks[0])

        # 2. target representations  (no gradient)
        with torch.no_grad():
            target_repr = self.target_encoder(x)   # (B, N, E)

            # Normalize target representations per-token (no learned affine params).
            # This prevents representation collapse: without it the target encoder
            # can minimise the loss trivially by making all embeddings near-zero.
            # The predictor must then match non-trivially normalised targets.
            target_repr = F.layer_norm(target_repr, (target_repr.size(-1),), weight=None, bias=None)

            # extract target patches for each block
            target_list = []                        # [B][M] of (Nt, E)
            for i in range(B):
                per_target = []
                for m in range(len(target_masks[i])):
                    t_idx = target_masks[i][m]
                    per_target.append(target_repr[i][t_idx])   # (Nt, E)
                target_list.append(per_target)

        # 3. context encoder
        ctx_out = self.context_encoder(x, context_masks)    # list[B] of (Nc, E)

        # 4. predictor
        predictions = self.predictor(ctx_out, target_masks, device)  # [B][M] (Nt, E)

        # 5. L2 loss: mean squared error averaged over all elements
        # (batch, target patches, embedding dim).  Gives values ~2.0 initially
        # for normalised representations, converging toward 0 during training.
        all_preds   = torch.cat([torch.cat(p, dim=0) for p in predictions], dim=0)
        all_targets = torch.cat([torch.cat(t, dim=0) for t in target_list], dim=0)

        loss = (all_preds - all_targets).pow(2).mean()
        return loss


# ---------------------------------------------------------------------------
# Linear Classifier  (for frozen-encoder evaluation)
# ---------------------------------------------------------------------------

class LinearClassifier(nn.Module):
    """Linear probe: one trainable linear layer on top of a frozen encoder.

    All encoder parameters have ``requires_grad`` switched off here, and the
    encoder is evaluated under ``torch.no_grad()``, so only ``head`` receives
    gradients.
    """

    def __init__(self, encoder: TargetEncoder, num_classes: int = 10):
        super().__init__()
        self.encoder = encoder
        for param in self.encoder.parameters():
            param.requires_grad_(False)

        # The feature width is read off the first block's QKV projection input.
        feature_dim = encoder.blocks[0].attn.qkv.in_features
        self.head = nn.Linear(feature_dim, num_classes)

        # Small-std truncated-normal weights and a zero bias for the probe.
        nn.init.trunc_normal_(self.head.weight, std=0.01)
        nn.init.zeros_(self.head.bias)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return class logits computed from the mean of the encoder's patch tokens."""
        with torch.no_grad():
            patch_feats = self.encoder(x)          # (B, N, E) — all patch tokens
            # The encoder output has no CLS token, so the image-level feature
            # is the average over the patch axis.
            pooled = patch_feats.mean(dim=1)       # (B, E)
        return self.head(pooled)