huggingface · JJJYmmm · May 9, 2026 · May 9, 2026 · May 9, 2026 · May 9, 2026
diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml
@@ -883,6 +883,8 @@
         title: Zamba
       - local: model_doc/zamba2
         title: Zamba2
+      - local: model_doc/zaya
+        title: ZAYA
       title: Text models
     - sections:
       - local: model_doc/aimv2
@@ -1379,6 +1381,8 @@
         title: Qwen3VL
       - local: model_doc/qwen3_vl_moe
         title: Qwen3VLMoe
+      - local: model_doc/zaya1_vl
+        title: ZAYA1-VL
       - local: model_doc/sam3
         title: SAM3
       - local: model_doc/sam3_video

diff --git a/docs/source/en/model_doc/zaya.md b/docs/source/en/model_doc/zaya.md
@@ -0,0 +1,63 @@
+<!--Copyright 2026 the HuggingFace Inc. team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+*This model was released on 2026-05-06 and added to Hugging Face Transformers on 2026-05-16.*
+
+# ZAYA
+
+## Overview
+
+ZAYA1 is a 760M active / 8.4B total parameter MoE language model trained by Zyphra. It combines Compressed
+Convolutional Attention (CCA), a nonlinear ZAYA1 router, and residual scaling.
+
+ZAYA1 uses the Gemma 3 tokenizer. For more details, see the [ZAYA1 model card](https://huggingface.co/Zyphra/ZAYA1-8B)
+and Zyphra's technical reports.
+
+This model was contributed by [JJJYmmm](https://github.com/JJJYmmm).
+
+## Usage examples
+
+```python
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+model_id = "Zyphra/ZAYA1-8B"
+tokenizer = AutoTokenizer.from_pretrained(model_id)
+model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto")
+
+inputs = tokenizer.apply_chat_template(
+    [{"role": "user", "content": "Write a haiku about recursion in programming."}],
+    tokenize=True,
+    add_generation_prompt=True,
+    enable_thinking=False,
+    return_tensors="pt",
+)
+inputs = inputs.to(model.device)
+outputs = model.generate(**inputs, max_new_tokens=256)
+print(tokenizer.decode(outputs[0], skip_special_tokens=True))
+```
+
+## ZayaConfig
+
+[[autodoc]] ZayaConfig
+
+## ZayaModel
+
+[[autodoc]] ZayaModel
+    - forward
+
+## ZayaForCausalLM
+
+[[autodoc]] ZayaForCausalLM
+    - forward
diff --git a/docs/source/en/model_doc/zaya1_vl.md b/docs/source/en/model_doc/zaya1_vl.md
@@ -0,0 +1,95 @@
+<!--Copyright 2026 the HuggingFace Inc. team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+*This model was released on 2026-05-08 and added to Hugging Face Transformers on 2026-05-15.*
+
+# ZAYA1-VL
+
+## Overview
+
+ZAYA1-VL is a vision-language model from Zyphra built on top of the ZAYA1 text decoder and the Qwen2.5-VL vision
+encoder. It adds vision-token-specific LoRA parameters in the text decoder and uses bidirectional attention between
+image placeholder tokens.
+
+For more details, see the [ZAYA1-VL model card](https://huggingface.co/Zyphra/ZAYA1-VL-8B).
+
+This model was contributed by [JJJYmmm](https://github.com/JJJYmmm).
+
+## Usage examples
+
+```python
+from transformers import AutoModelForImageTextToText, AutoProcessor
+
+model_id = "Zyphra/ZAYA1-VL-8B"
+processor = AutoProcessor.from_pretrained(model_id)
+model = AutoModelForImageTextToText.from_pretrained(model_id, device_map="auto")
+
+messages = [
+    {
+        "role": "user",
+        "content": [
+            {"type": "image", "url": "http://images.cocodataset.org/val2017/000000039769.jpg"},
+            {"type": "text", "text": "What do you see in the image?"},
+        ],
+    }
+]
+
+inputs = processor.apply_chat_template(
+    messages,
+    tokenize=True,
+    add_generation_prompt=True,
+    return_dict=True,
+    return_tensors="pt",
+).to(model.device)
+outputs = model.generate(**inputs, max_new_tokens=100)
+generated_ids = [output_ids[len(input_ids) :] for input_ids, output_ids in zip(inputs.input_ids, outputs)]
+print(processor.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False))
+```
+
+## Zaya1VLConfig
+
+[[autodoc]] Zaya1VLConfig
+
+## Zaya1VLTextConfig
+
+[[autodoc]] Zaya1VLTextConfig
+
+## Zaya1VLVisionConfig
+
+[[autodoc]] Zaya1VLVisionConfig
+
+## Zaya1VLProcessor
+
+[[autodoc]] Zaya1VLProcessor
+
+## Zaya1VLModel
+
+[[autodoc]] Zaya1VLModel
+    - forward
+
+## Zaya1VLVisionModel
+
+[[autodoc]] Zaya1VLVisionModel
+    - forward
+
+## Zaya1VLTextModel
+
+[[autodoc]] Zaya1VLTextModel
+    - forward
+
+## Zaya1VLForConditionalGeneration
+
+[[autodoc]] Zaya1VLForConditionalGeneration
+    - forward
diff --git a/src/transformers/cache_utils.py b/src/transformers/cache_utils.py
@@ -864,6 +864,33 @@ def reorder_cache(self, beam_idx: torch.LongTensor):
         DynamicLayer.reorder_cache(self, beam_idx)
 
 
+class LinearAttentionAndSlidingWindowAttentionLayer(LinearAttentionLayer, DynamicSlidingWindowLayer):
+    # Holds both linear-attention and sliding-window states; the dynamic sliding part makes it non-compileable
+    is_compileable = False
+
+    def __init__(self, config: PreTrainedConfig | None = None):
+        DynamicSlidingWindowLayer.__init__(self, config)  # only the sliding-window parent receives `config`
+        LinearAttentionLayer.__init__(self)
+
+    def lazy_initialization(self, *args, **kwargs) -> None:
+        # Attention path: `update` calls `lazy_initialization` with exactly 2 positional args (and no kwargs)
+        if len(args) == 2 and len(kwargs) == 0:
+            DynamicSlidingWindowLayer.lazy_initialization(self, *args)
+        # Linear-attention path: `update_conv_state` / `update_recurrent_state` call it with exactly 1 kwarg
+        # (to say whether it is for the conv or ssm states). Any other call shape matches neither branch (no-op).
+        if len(args) == 0 and len(kwargs) == 1:
+            LinearAttentionLayer.lazy_initialization(self, **kwargs)
+
+    def reset(self) -> None:
+        LinearAttentionLayer.reset(self)
+        DynamicSlidingWindowLayer.reset(self)
+
+    def reorder_cache(self, beam_idx: torch.LongTensor):
+        """Reorders both underlying caches for beam search, given the selected beam indices."""
+        LinearAttentionLayer.reorder_cache(self, beam_idx)
+        DynamicSlidingWindowLayer.reorder_cache(self, beam_idx)
+
+
 # Pre-register the standard layer types (some classes are shared between multiple types,
 # e.g. ``DynamicSlidingWindowLayer`` covers both ``"sliding_attention"`` and
 # ``"chunked_attention"`` — those need an explicit map entry rather than the
@@ -883,6 +910,7 @@ def reorder_cache(self, beam_idx: torch.LongTensor):
         "moe": LinearAttentionLayer,
         # Hybrid layers (e.g. zamba / zamba2) carry both a linear-attention state and a dynamic-attention state.
         "hybrid": LinearAttentionAndFullAttentionLayer,
+        "hybrid_sliding": LinearAttentionAndSlidingWindowAttentionLayer,
     }
 )
 

diff --git a/src/transformers/configuration_utils.py b/src/transformers/configuration_utils.py
@@ -71,7 +71,8 @@
     "attention",
     "sparse",
     "dense",
-    "hybrid",  # for layers that have both mamba and attention in zamba and zamba2
+    "hybrid",  # for zamba/zamba2/zaya1, which use full attention + conv states
+    "hybrid_sliding",  # for zaya1, which uses swa + conv states
     "moe",  # for nemotron_h, which uses either attention, mamba or moe
 )
 

diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py
@@ -479,6 +479,8 @@
     from .youtu import *
     from .zamba import *
     from .zamba2 import *
+    from .zaya import *
+    from .zaya1_vl import *
     from .zoedepth import *
 else:
     import sys

diff --git a/src/transformers/models/auto/auto_mappings.py b/src/transformers/models/auto/auto_mappings.py
@@ -643,6 +643,10 @@
         ("youtu", "YoutuConfig"),
         ("zamba", "ZambaConfig"),
         ("zamba2", "Zamba2Config"),
+        ("zaya", "ZayaConfig"),
+        ("zaya1_vl", "Zaya1VLConfig"),
+        ("zaya1_vl_text", "Zaya1VLTextConfig"),
+        ("zaya1_vl_vision", "Zaya1VLVisionConfig"),
         ("zoedepth", "ZoeDepthConfig"),
     ]
 )
@@ -858,6 +862,8 @@
         ("xclip_vision_model", "x_clip"),
         ("xlm-roberta", "xlm_roberta"),
         ("xlm-roberta-xl", "xlm_roberta_xl"),
+        ("zaya1_vl_text", "zaya1_vl"),
+        ("zaya1_vl_vision", "zaya1_vl"),
     ]
 )
 

diff --git a/src/transformers/models/auto/image_processing_auto.py b/src/transformers/models/auto/image_processing_auto.py
@@ -154,6 +154,7 @@
             ("vit_msn", {"torchvision": "ViTImageProcessor", "pil": "ViTImageProcessorPil"}),
             ("vivit", {"torchvision": "VivitImageProcessor"}),
             ("xclip", {"torchvision": "CLIPImageProcessor", "pil": "CLIPImageProcessorPil"}),
+            ("zaya1_vl", {"torchvision": "Qwen2VLImageProcessor", "pil": "Qwen2VLImageProcessorPil"}),
         ]
     )
 

diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py
@@ -512,6 +512,9 @@ class _BaseModelWithGenerate(PreTrainedModel, GenerationMixin):
         ("youtu", "YoutuModel"),
         ("zamba", "ZambaModel"),
         ("zamba2", "Zamba2Model"),
+        ("zaya", "ZayaModel"),
+        ("zaya1_vl", "Zaya1VLModel"),
+        ("zaya1_vl_text", "Zaya1VLTextModel"),
     ]
 )
 
@@ -774,6 +777,7 @@ class _BaseModelWithGenerate(PreTrainedModel, GenerationMixin):
         ("youtu", "YoutuForCausalLM"),
         ("zamba", "ZambaForCausalLM"),
         ("zamba2", "Zamba2ForCausalLM"),
+        ("zaya", "ZayaForCausalLM"),
     ]
 )
 
@@ -1050,6 +1054,7 @@ class _BaseModelWithGenerate(PreTrainedModel, GenerationMixin):
         ("video_llava", "VideoLlavaForConditionalGeneration"),
         ("vipllava", "VipLlavaForConditionalGeneration"),
         ("vision-encoder-decoder", "VisionEncoderDecoderModel"),
+        ("zaya1_vl", "Zaya1VLForConditionalGeneration"),
     ]
 )
 

diff --git a/src/transformers/models/auto/processing_auto.py b/src/transformers/models/auto/processing_auto.py
@@ -189,6 +189,7 @@
             ("wavlm", "Wav2Vec2Processor"),
             ("whisper", "WhisperProcessor"),
             ("xclip", "XCLIPProcessor"),
+            ("zaya1_vl", "Zaya1VLProcessor"),
         ]
     )
 

diff --git a/src/transformers/models/auto/tokenization_auto.py b/src/transformers/models/auto/tokenization_auto.py
@@ -340,6 +340,7 @@
         ("xlstm", "GPTNeoXTokenizer" if is_tokenizers_available() else None),
         ("xmod", "XLMRobertaTokenizer" if is_tokenizers_available() else None),
         ("yoso", "AlbertTokenizer" if is_tokenizers_available() else None),
+        ("zaya", "GemmaTokenizer" if is_tokenizers_available() else None),
     ]
 )
 

diff --git a/src/transformers/models/zaya/__init__.py b/src/transformers/models/zaya/__init__.py
@@ -0,0 +1,28 @@
+# Copyright 2026 Zyphra and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:  # static type checkers only: import the submodules' public names directly
+    from .configuration_zaya import *
+    from .modeling_zaya import *
+
+else:  # at runtime: replace this module's `sys.modules` entry with a `_LazyModule`
+    import sys
+
+    _file = globals()["__file__"]  # path of this __init__.py, used to derive the import structure below
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)