Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -49,10 +49,10 @@ def initialize(
name="ZImagePipeline",
task=PipelineTask.PIXEL_GENERATION,
default_encoding="bfloat16",
supported_encodings={"bfloat16"},
supported_encodings={"bfloat16", "float32"},
example_repo_ids=[
"Tongyi-MAI/Z-Image",
"Zyphra/Z-Image",
"Tongyi-MAI/Z-Image-Turbo",
],
pipeline_model=ZImagePipeline, # type: ignore[arg-type]
context_type=PixelContext,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,17 +13,49 @@

import math

from max.dtype import DType
from max.experimental import functional as F
from max.experimental.nn import Linear, Module
from max.experimental.nn.norm import RMSNorm
from max.experimental.nn.sequential import ModuleList
from max.experimental.tensor import Tensor
from max.nn.attention.mask_config import MHAMaskVariant
from max.nn.kernels import flash_attention_gpu as _flash_attention_gpu

from ...flux2_modulev3.layers.embeddings import apply_rotary_emb
from max.nn.kernels import (
rope_ragged_with_position_ids as _rope_ragged_with_position_ids,
)

# Re-export the raw kernels under their public names after passing each one
# through F.functional; the undecorated versions stay available under their
# underscore-prefixed import aliases.
# NOTE(review): presumably F.functional adapts the graph-level kernels so they
# accept max.experimental Tensor arguments — confirm against its documentation.
flash_attention_gpu = F.functional(_flash_attention_gpu)
rope_ragged_with_position_ids = F.functional(_rope_ragged_with_position_ids)


def _apply_zimage_qk_rope(
    query: Tensor,
    key: Tensor,
    freqs_cis: Tensor,
) -> tuple[Tensor, Tensor]:
    """Rotate ``query`` and ``key`` with precomputed interleaved [cos, sin] RoPE.

    Both inputs are flattened to ``[batch * seq, heads, head_dim]``, passed
    through ``rope_ragged_with_position_ids`` with ``interleaved=True``, and
    reshaped back to ``[batch, seq, heads, head_dim]``. Position ids count
    ``0 .. seq - 1`` and the same sequence is repeated for every batch entry.

    Args:
        query: Tensor whose first four dimensions are read as
            ``(batch, seq, heads, head_dim)``.
        key: Tensor reshaped using the dimensions read from ``query``, so it
            must hold the same number of elements.
        freqs_cis: Frequency table forwarded unchanged to the RoPE kernel.

    Returns:
        The rotated ``(query, key)`` pair, each shaped
        ``[batch, seq, heads, head_dim]``.
    """
    batch_size, seq_len, num_heads, head_dim = (
        query.shape[axis] for axis in range(4)
    )
    token_count = batch_size * seq_len
    ragged_shape = [token_count, num_heads, head_dim]
    padded_shape = [batch_size, seq_len, num_heads, head_dim]

    # The kernel works on a flat token axis, so fold batch and sequence together.
    flat_query, flat_key = (
        F.reshape(states, ragged_shape) for states in (query, key)
    )

    # One position id per flattened token: 0..seq_len-1 tiled across the batch.
    token_positions = F.arange(
        0, seq_len, dtype=DType.uint32, device=query.device
    )
    token_positions = F.reshape(
        F.broadcast_to(token_positions[None, :], [batch_size, seq_len]),
        [token_count],
    )

    rotated_query, rotated_key = (
        rope_ragged_with_position_ids(
            states, freqs_cis, token_positions, interleaved=True
        )
        for states in (flat_query, flat_key)
    )

    # Restore the [batch, seq, heads, head_dim] layout for the caller.
    restored_query = F.reshape(rotated_query, padded_shape)
    restored_key = F.reshape(rotated_key, padded_shape)
    return restored_query, restored_key


class ZImageAttention(Module[..., Tensor]):
Expand All @@ -45,13 +77,12 @@ def __init__(
self.norm_q = RMSNorm(self.head_dim, eps=eps) if qk_norm else None
self.norm_k = RMSNorm(self.head_dim, eps=eps) if qk_norm else None

# Keep ModuleList naming for diffusers-compatible key loading.
self.to_out = ModuleList([Linear(dim, dim, bias=False)])
self.to_out = Linear(dim, dim, bias=False)

def forward(
self,
hidden_states: Tensor,
freqs_cis: tuple[Tensor, Tensor],
freqs_cis: Tensor,
) -> Tensor:
batch_size = hidden_states.shape[0]
seq_len = hidden_states.shape[1]
Expand All @@ -73,22 +104,7 @@ def forward(
if self.norm_k is not None:
key = self.norm_k(key)

query = apply_rotary_emb(
query,
freqs_cis,
use_real=True,
use_real_unbind_dim=-1,
sequence_dim=1,
)
key = apply_rotary_emb(
key,
freqs_cis,
use_real=True,
use_real_unbind_dim=-1,
sequence_dim=1,
)
query = query.cast(value.dtype)
key = key.cast(value.dtype)
query, key = _apply_zimage_qk_rope(query, key, freqs_cis)

out = flash_attention_gpu(
query,
Expand All @@ -99,4 +115,4 @@ def forward(
)

out = F.reshape(out, [batch_size, seq_len, self.inner_dim])
return self.to_out[0](out)
return self.to_out(out)
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,6 @@
from max.experimental.nn import Linear, Module
from max.experimental.tensor import Tensor

from ...flux2_modulev3.layers.embeddings import get_1d_rotary_pos_embed


class TimestepEmbedder(Module[[Tensor], Tensor]):
def __init__(
Expand Down Expand Up @@ -67,7 +65,21 @@ def forward(self, t: Tensor) -> Tensor:
return t_emb


class RopeEmbedder(Module[[Tensor], tuple[Tensor, Tensor]]):
def _get_1d_rope_interleaved(
dim: int,
pos: Tensor,
theta: float = 10000.0,
) -> Tensor:
"""Compute 1-D RoPE in [cos, sin] interleaved pair format."""
half = dim // 2
freq_exp = F.arange(0, half, dtype=DType.float32, device=pos.device) / half
freq = 1.0 / (theta**freq_exp)
freqs = F.outer(pos, freq)
paired = F.stack([F.cos(freqs), F.sin(freqs)], axis=2)
return F.reshape(paired, [freqs.shape[0], dim])


class RopeEmbedder(Module[[Tensor], Tensor]):
def __init__(
self,
theta: float = 256.0,
Expand All @@ -76,28 +88,15 @@ def __init__(
self.theta = theta
self.axes_dims = axes_dims

def forward(self, ids: Tensor) -> tuple[Tensor, Tensor]:
if ids.rank != 2:
raise ValueError(f"Expected 2D ids tensor, got rank={ids.rank}")

if int(ids.shape[-1]) != len(self.axes_dims):
raise ValueError(
"ids last dimension must match axes_dims length "
f"({len(self.axes_dims)}), got {ids.shape[-1]}"
)

def forward(self, ids: Tensor) -> Tensor:
pos = ids.cast(DType.float32)
cos_out = []
sin_out = []
parts = []
for i in range(len(self.axes_dims)):
cos_i, sin_i = get_1d_rotary_pos_embed(
self.axes_dims[i],
pos[:, i],
theta=self.theta,
use_real=True,
repeat_interleave_real=True,
parts.append(
_get_1d_rope_interleaved(
self.axes_dims[i],
pos[:, i],
theta=self.theta,
)
)
cos_out.append(cos_i)
sin_out.append(sin_i)

return F.concat(cos_out, axis=-1), F.concat(sin_out, axis=-1)
return F.concat(parts, axis=-1)
Loading
Loading