huggingface · JJJYmmm · May 9, 2026 · May 9, 2026 · May 9, 2026 · May 9, 2026
diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml
@@ -894,6 +894,8 @@
         title: Zamba
       - local: model_doc/zamba2
         title: Zamba2
+      - local: model_doc/zaya
+        title: ZAYA
       title: Text models
     - sections:
       - local: model_doc/aimv2

diff --git a/docs/source/en/model_doc/zaya.md b/docs/source/en/model_doc/zaya.md
@@ -0,0 +1,63 @@
+<!--Copyright 2026 the HuggingFace Inc. team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+*This model was released on 2026-05-06 and added to Hugging Face Transformers on 2026-05-26.*
+
+# ZAYA
+
+## Overview
+
+ZAYA1 is a 760M active / 8.4B total parameter MoE language model trained by Zyphra. It combines Compressed
+Convolutional Attention (CCA), a nonlinear ZAYA1 router, and residual scaling.
+
+ZAYA1 uses the Gemma 3 tokenizer. For more details, see the [ZAYA1 model card](https://huggingface.co/Zyphra/ZAYA1-8B)
+and Zyphra's technical reports.
+
+This model was contributed by [JJJYmmm](https://github.com/JJJYmmm).
+
+## Usage examples
+
+```python
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+model_id = "Zyphra/ZAYA1-8B"
+tokenizer = AutoTokenizer.from_pretrained(model_id)
+model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto")
+
+inputs = tokenizer.apply_chat_template(
+    [{"role": "user", "content": "Write a haiku about recursion in programming."}],
+    tokenize=True,
+    add_generation_prompt=True,
+    enable_thinking=False,
+    return_tensors="pt",
+)
+inputs = inputs.to(model.device)
+outputs = model.generate(**inputs, max_new_tokens=256)
+print(tokenizer.decode(outputs[0], skip_special_tokens=True))
+```
+
+## ZayaConfig
+
+[[autodoc]] ZayaConfig
+
+## ZayaModel
+
+[[autodoc]] ZayaModel
+    - forward
+
+## ZayaForCausalLM
+
+[[autodoc]] ZayaForCausalLM
+    - forward
diff --git a/src/transformers/cache_utils.py b/src/transformers/cache_utils.py
@@ -864,6 +864,33 @@ def reorder_cache(self, beam_idx: torch.LongTensor):
         DynamicLayer.reorder_cache(self, beam_idx)
 
 
+class LinearAttentionAndSlidingWindowAttentionLayer(LinearAttentionLayer, DynamicSlidingWindowLayer):
+    # Pairs linear-attention states with a dynamic sliding-window cache; the latter makes it non-compileable
+    is_compileable = False
+
+    def __init__(self, config: PreTrainedConfig | None = None):
+        DynamicSlidingWindowLayer.__init__(self, config)
+        LinearAttentionLayer.__init__(self)
+
+    def lazy_initialization(self, *args, **kwargs) -> None:
+        # When the Attention cache is used with `update`, `lazy_initialization` is called with 2 positional args
+        if len(args) == 2 and len(kwargs) == 0:
+            DynamicSlidingWindowLayer.lazy_initialization(self, *args)
+        # The LinearAttention cache (`update_conv_state` / `update_recurrent_state`) instead passes exactly 1 kwarg,
+        # as it needs to know if it's for the conv or ssm states. Any other call shape hits neither branch (no-op).
+        if len(args) == 0 and len(kwargs) == 1:
+            LinearAttentionLayer.lazy_initialization(self, **kwargs)
+
+    def reset(self) -> None:
+        LinearAttentionLayer.reset(self)
+        DynamicSlidingWindowLayer.reset(self)
+
+    def reorder_cache(self, beam_idx: torch.LongTensor):
+        """Reorders the cache for beam search, given the selected beam indices."""
+        LinearAttentionLayer.reorder_cache(self, beam_idx)
+        DynamicSlidingWindowLayer.reorder_cache(self, beam_idx)
+
+
 # Pre-register the standard layer types (some classes are shared between multiple types,
 # e.g. ``DynamicSlidingWindowLayer`` covers both ``"sliding_attention"`` and
 # ``"chunked_attention"`` — those need an explicit map entry rather than the
@@ -883,6 +910,7 @@ def reorder_cache(self, beam_idx: torch.LongTensor):
         "moe": LinearAttentionLayer,
         # Hybrid layers (e.g. zamba / zamba2) carry both a linear-attention state and a dynamic-attention state.
         "hybrid": LinearAttentionAndFullAttentionLayer,
+        "hybrid_sliding": LinearAttentionAndSlidingWindowAttentionLayer,
     }
 )
 

diff --git a/src/transformers/configuration_utils.py b/src/transformers/configuration_utils.py
@@ -71,7 +71,8 @@
     "attention",
     "sparse",
     "dense",
-    "hybrid",  # for layers that have both mamba and attention in zamba and zamba2
+    "hybrid",  # for zamba/zamba2/zaya1, which use full attention + conv states
+    "hybrid_sliding",  # for zaya1, which uses swa + conv states
     "moe",  # for nemotron_h, which uses either attention, mamba or moe
 )
 

diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py
@@ -485,6 +485,7 @@
     from .youtu import *
     from .zamba import *
     from .zamba2 import *
+    from .zaya import *
     from .zoedepth import *
 else:
     import sys

diff --git a/src/transformers/models/auto/auto_mappings.py b/src/transformers/models/auto/auto_mappings.py
@@ -651,6 +651,7 @@
         ("youtu", "YoutuConfig"),
         ("zamba", "ZambaConfig"),
         ("zamba2", "Zamba2Config"),
+        ("zaya", "ZayaConfig"),
         ("zoedepth", "ZoeDepthConfig"),
     ]
 )

diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py
@@ -519,6 +519,7 @@ class _BaseModelWithGenerate(PreTrainedModel, GenerationMixin):
         ("youtu", "YoutuModel"),
         ("zamba", "ZambaModel"),
         ("zamba2", "Zamba2Model"),
+        ("zaya", "ZayaModel"),
     ]
 )
 
@@ -783,6 +784,7 @@ class _BaseModelWithGenerate(PreTrainedModel, GenerationMixin):
         ("youtu", "YoutuForCausalLM"),
         ("zamba", "ZambaForCausalLM"),
         ("zamba2", "Zamba2ForCausalLM"),
+        ("zaya", "ZayaForCausalLM"),
     ]
 )
 

diff --git a/src/transformers/models/auto/tokenization_auto.py b/src/transformers/models/auto/tokenization_auto.py
@@ -342,6 +342,7 @@
         ("xlstm", "GPTNeoXTokenizer" if is_tokenizers_available() else None),
         ("xmod", "XLMRobertaTokenizer" if is_tokenizers_available() else None),
         ("yoso", "AlbertTokenizer" if is_tokenizers_available() else None),
+        ("zaya", "GemmaTokenizer" if is_tokenizers_available() else None),
     ]
 )
 

diff --git a/src/transformers/models/zaya/__init__.py b/src/transformers/models/zaya/__init__.py
@@ -0,0 +1,28 @@
+# Copyright 2026 Zyphra and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+    from .configuration_zaya import *
+    from .modeling_zaya import *
+
+else:
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/src/transformers/models/zaya/configuration_zaya.py b/src/transformers/models/zaya/configuration_zaya.py
@@ -0,0 +1,129 @@
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+#           This file was automatically generated from src/transformers/models/zaya/modular_zaya.py.
+#               Do NOT edit this file manually as any edits will be overwritten by the generation of
+#             the file from the modular. If any change should be done, please apply the change to the
+#                          modular_zaya.py file directly. One of our CI enforces this.
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# Copyright 2026 Zyphra and the HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Any, Literal
+
+from huggingface_hub.dataclasses import strict
+
+from ...configuration_utils import PreTrainedConfig
+from ...modeling_rope_utils import RopeParameters
+from ...utils import auto_docstring
+
+
+@auto_docstring(checkpoint="Zyphra/ZAYA1-8B")
+@strict
+class ZayaConfig(PreTrainedConfig):
+    r"""
+    lm_head_bias (`bool`, *optional*, defaults to `False`):
+        Whether to add a bias to the language modeling head.
+    router_hidden_size (`int`, *optional*, defaults to 256):
+        Hidden size used by the ZAYA router.
+    cca_time0 (`int`, *optional*, defaults to 2):
+        First temporal parameter of the CCA projection.
+    cca_time1 (`int`, *optional*, defaults to 2):
+        Second temporal parameter of the CCA projection.
+
+    ```python
+    >>> from transformers import ZayaConfig, ZayaModel
+
+    >>> configuration = ZayaConfig()
+    >>> model = ZayaModel(configuration)
+
+    >>> configuration = model.config
+    ```
+    """
+
+    model_type = "zaya"
+    keys_to_ignore_at_inference = ["past_key_values"]
+
+    base_model_fsdp_plan = {
+        "embed_tokens": "free_full_weight",
+        "layers.*": "free_full_weight",
+        "norm": "keep_full_weight",
+    }
+
+    vocab_size: int = 262272
+    hidden_size: int = 2048
+    num_hidden_layers: int = 40
+    num_attention_heads: int = 8
+    num_key_value_heads: int = 2
+    hidden_act: str = "silu"
+    max_position_embeddings: int = 131072
+    initializer_range: float = 0.02
+    rms_norm_eps: float = 1e-5
+    use_cache: bool = True
+    tie_word_embeddings: bool = True
+    rope_parameters: RopeParameters | dict | None = None
+    sliding_window: int | None = None
+    attention_dropout: float | int = 0.0
+    moe_intermediate_size: int = 2048
+
+    num_experts_per_tok: int = 1
+    num_experts: int = 16
+    output_router_logits: bool = False
+    layer_types: list[str] | None = None
+    pad_token_id: int | None = 0
+    bos_token_id: int | None = 2
+    eos_token_id: int | list[int] | None = 106
+
+    # Zaya-specific attention
+    head_dim: int = 128
+    attention_bias: bool = False
+    # Output head, router and CCA settings (described in the class docstring)
+    lm_head_bias: bool = False
+    router_hidden_size: int = 256
+    cca_time0: int = 2
+    cca_time1: int = 2
+
+    def __post_init__(self, **kwargs):
+        self.layer_types = ["hybrid"] * self.num_hidden_layers if self.layer_types is None else list(self.layer_types)
+        # Layers default to all-"hybrid" above; RoPE defaults are keyed by layer type (only `rope_theta` differs).
+        default_rope_params: dict[Literal["hybrid", "hybrid_sliding"], dict[str, Any]] = {
+            "hybrid": {
+                "rope_type": "default",
+                "rope_theta": 5_000_000.0,
+                "partial_rotary_factor": 0.5,
+            },
+            "hybrid_sliding": {
+                "rope_type": "default",
+                "rope_theta": 10_000.0,
+                "partial_rotary_factor": 0.5,
+            },
+        }
+        if self.rope_parameters is None:
+            self.rope_parameters = default_rope_params
+        # NOTE(review): presumably makes the base RoPE validation skip the per-layer-type keys — confirm upstream.
+        super().__post_init__(**kwargs, ignore_keys_at_rope_validation={"hybrid", "hybrid_sliding"})
+
+    def convert_rope_params_to_dict(self, **kwargs):
+        # No legacy flat RoPE format is supported here; conversion writes the nested ZAYA layer-type format directly.
+        return kwargs
+
+    def validate_architecture(self):
+        """Part of ``@strict``-powered validation: checks expert, head-count and sliding-window settings."""
+        if self.num_experts_per_tok != 1:
+            raise ValueError("ZAYA currently supports `num_experts_per_tok=1` only.")
+        if self.num_attention_heads % self.num_key_value_heads != 0:
+            raise ValueError("`num_attention_heads` must be a multiple of `num_key_value_heads`.")
+        if "hybrid_sliding" in self.layer_types and self.sliding_window is None:
+            raise ValueError("`sliding_window` must be set when `layer_types` contains `hybrid_sliding`.")
+
+
+__all__ = ["ZayaConfig"]